Print this page
3484 enhance and document tail follow support
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
↓ open down ↓ |
23 lines elided |
↑ open up ↑ |
24 24 */
25 25 /*
26 26 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 27 */
28 28
29 29 /*
30 30 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
31 31 * All Rights Reserved
32 32 */
33 33
34 +/*
35 + * Copyright (c) 2013, Joyent, Inc. All rights reserved.
36 + */
37 +
34 38 #include <sys/param.h>
35 39 #include <sys/types.h>
36 40 #include <sys/systm.h>
37 41 #include <sys/cred.h>
38 42 #include <sys/time.h>
39 43 #include <sys/vnode.h>
40 44 #include <sys/vfs.h>
41 45 #include <sys/vfs_opreg.h>
42 46 #include <sys/file.h>
43 47 #include <sys/filio.h>
44 48 #include <sys/uio.h>
45 49 #include <sys/buf.h>
46 50 #include <sys/mman.h>
47 51 #include <sys/pathname.h>
48 52 #include <sys/dirent.h>
49 53 #include <sys/debug.h>
50 54 #include <sys/vmsystm.h>
51 55 #include <sys/fcntl.h>
52 56 #include <sys/flock.h>
53 57 #include <sys/swap.h>
54 58 #include <sys/errno.h>
55 59 #include <sys/strsubr.h>
56 60 #include <sys/sysmacros.h>
57 61 #include <sys/kmem.h>
58 62 #include <sys/cmn_err.h>
59 63 #include <sys/pathconf.h>
60 64 #include <sys/utsname.h>
61 65 #include <sys/dnlc.h>
62 66 #include <sys/acl.h>
63 67 #include <sys/systeminfo.h>
64 68 #include <sys/policy.h>
65 69 #include <sys/sdt.h>
66 70 #include <sys/list.h>
67 71 #include <sys/stat.h>
68 72 #include <sys/zone.h>
69 73
70 74 #include <rpc/types.h>
71 75 #include <rpc/auth.h>
72 76 #include <rpc/clnt.h>
73 77
74 78 #include <nfs/nfs.h>
75 79 #include <nfs/nfs_clnt.h>
76 80 #include <nfs/nfs_acl.h>
77 81 #include <nfs/lm.h>
78 82 #include <nfs/nfs4.h>
79 83 #include <nfs/nfs4_kprot.h>
80 84 #include <nfs/rnode4.h>
81 85 #include <nfs/nfs4_clnt.h>
82 86
83 87 #include <vm/hat.h>
84 88 #include <vm/as.h>
85 89 #include <vm/page.h>
86 90 #include <vm/pvn.h>
87 91 #include <vm/seg.h>
88 92 #include <vm/seg_map.h>
89 93 #include <vm/seg_kpm.h>
90 94 #include <vm/seg_vn.h>
91 95
92 96 #include <fs/fs_subr.h>
93 97
94 98 #include <sys/ddi.h>
95 99 #include <sys/int_fmtio.h>
96 100 #include <sys/fs/autofs.h>
97 101
98 102 typedef struct {
99 103 nfs4_ga_res_t *di_garp;
100 104 cred_t *di_cred;
101 105 hrtime_t di_time_call;
102 106 } dirattr_info_t;
103 107
104 108 typedef enum nfs4_acl_op {
105 109 NFS4_ACL_GET,
106 110 NFS4_ACL_SET
107 111 } nfs4_acl_op_t;
108 112
109 113 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);
110 114
111 115 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
112 116 char *, dirattr_info_t *);
113 117
114 118 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
115 119 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
116 120 nfs4_error_t *, int *);
117 121 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
118 122 cred_t *);
119 123 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
120 124 stable_how4 *);
121 125 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
122 126 cred_t *, bool_t, struct uio *);
123 127 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
124 128 vsecattr_t *);
125 129 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
126 130 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
127 131 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
128 132 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
129 133 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
130 134 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
131 135 int, vnode_t **, cred_t *);
132 136 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
133 137 cred_t *, int, int, enum createmode4, int);
134 138 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
135 139 caller_context_t *);
136 140 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
137 141 vnode_t *, char *, cred_t *, nfsstat4 *);
138 142 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
139 143 vnode_t *, char *, cred_t *, nfsstat4 *);
140 144 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
141 145 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
142 146 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
143 147 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
144 148 page_t *[], size_t, struct seg *, caddr_t,
145 149 enum seg_rw, cred_t *);
146 150 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
147 151 cred_t *);
148 152 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
149 153 int, cred_t *);
150 154 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
151 155 int, cred_t *);
152 156 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
153 157 static void nfs4_set_mod(vnode_t *);
154 158 static void nfs4_get_commit(vnode_t *);
155 159 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
156 160 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
157 161 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
158 162 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
159 163 cred_t *);
160 164 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
161 165 cred_t *);
162 166 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
163 167 hrtime_t, vnode_t *, cred_t *);
164 168 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
165 169 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
166 170 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
167 171 u_offset_t);
168 172 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
169 173 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
170 174 static cred_t *state_to_cred(nfs4_open_stream_t *);
171 175 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
172 176 static pid_t lo_to_pid(lock_owner4 *);
173 177 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
174 178 cred_t *, nfs4_lock_owner_t *);
175 179 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
176 180 nfs4_lock_owner_t *);
177 181 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
178 182 static void nfs4_delmap_callback(struct as *, void *, uint_t);
179 183 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
180 184 static nfs4_delmapcall_t *nfs4_init_delmapcall();
181 185 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
182 186 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
183 187 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
184 188 uid_t, gid_t, int);
185 189
186 190 /*
187 191 * Routines that implement the setting of v4 args for the misc. ops
188 192 */
189 193 static void nfs4args_lock_free(nfs_argop4 *);
190 194 static void nfs4args_lockt_free(nfs_argop4 *);
191 195 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
192 196 int, rnode4_t *, cred_t *, bitmap4, int *,
193 197 nfs4_stateid_types_t *);
194 198 static void nfs4args_setattr_free(nfs_argop4 *);
195 199 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
196 200 bitmap4);
197 201 static void nfs4args_verify_free(nfs_argop4 *);
198 202 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
199 203 WRITE4args **, nfs4_stateid_types_t *);
200 204
201 205 /*
202 206 * These are the vnode ops functions that implement the vnode interface to
203 207 * the networked file system. See more comments below at nfs4_vnodeops.
204 208 */
205 209 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
206 210 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
207 211 caller_context_t *);
208 212 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
209 213 caller_context_t *);
210 214 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
211 215 caller_context_t *);
212 216 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
213 217 caller_context_t *);
214 218 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
215 219 caller_context_t *);
216 220 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
217 221 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
218 222 caller_context_t *);
219 223 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
220 224 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
221 225 int, vnode_t **, cred_t *, int, caller_context_t *,
222 226 vsecattr_t *);
223 227 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
224 228 int);
225 229 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
226 230 caller_context_t *, int);
227 231 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
228 232 caller_context_t *, int);
229 233 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
230 234 cred_t *, caller_context_t *, int, vsecattr_t *);
231 235 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
232 236 caller_context_t *, int);
233 237 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
234 238 cred_t *, caller_context_t *, int);
235 239 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
236 240 caller_context_t *, int);
237 241 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
238 242 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
239 243 page_t *[], size_t, struct seg *, caddr_t,
240 244 enum seg_rw, cred_t *, caller_context_t *);
241 245 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
242 246 caller_context_t *);
243 247 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
244 248 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
245 249 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
246 250 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
247 251 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
248 252 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
249 253 struct flk_callback *, cred_t *, caller_context_t *);
250 254 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
251 255 cred_t *, caller_context_t *);
252 256 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
253 257 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
254 258 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
255 259 cred_t *, caller_context_t *);
256 260 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
257 261 caller_context_t *);
258 262 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
259 263 caller_context_t *);
260 264 /*
261 265 * These vnode ops are required to be called from outside this source file,
262 266 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
263 267 * as static.
264 268 */
265 269 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
266 270 caller_context_t *);
267 271 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
268 272 int nfs4_lookup(vnode_t *, char *, vnode_t **,
269 273 struct pathname *, int, vnode_t *, cred_t *,
270 274 caller_context_t *, int *, pathname_t *);
271 275 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
272 276 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
273 277 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
274 278 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
275 279 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
276 280 caller_context_t *);
277 281 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
278 282 caller_context_t *);
279 283 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
280 284 caller_context_t *);
281 285
282 286 /*
283 287 * Used for nfs4_commit_vp() to indicate if we should
284 288 * wait on pending writes.
285 289 */
286 290 #define NFS4_WRITE_NOWAIT 0
287 291 #define NFS4_WRITE_WAIT 1
288 292
289 293 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */
290 294
291 295 /*
292 296 * Error flags used to pass information about certain special errors
293 297 * which need to be handled specially.
294 298 */
295 299 #define NFS_EOF -98
296 300 #define NFS_VERF_MISMATCH -97
297 301
298 302 /*
299 303 * Flags used to differentiate between which operation drove the
300 304 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
301 305 */
302 306 #define NFS4_CLOSE_OP 0x1
303 307 #define NFS4_DELMAP_OP 0x2
304 308 #define NFS4_INACTIVE_OP 0x3
305 309
306 310 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
307 311
308 312 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
309 313 #define ALIGN64(x, ptr, sz) \
310 314 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \
311 315 if (x) { \
312 316 x = sizeof (uint64_t) - (x); \
313 317 sz -= (x); \
314 318 ptr += (x); \
315 319 }
316 320
317 321 #ifdef DEBUG
318 322 int nfs4_client_attr_debug = 0;
319 323 int nfs4_client_state_debug = 0;
320 324 int nfs4_client_shadow_debug = 0;
321 325 int nfs4_client_lock_debug = 0;
322 326 int nfs4_seqid_sync = 0;
323 327 int nfs4_client_map_debug = 0;
324 328 static int nfs4_pageio_debug = 0;
325 329 int nfs4_client_inactive_debug = 0;
326 330 int nfs4_client_recov_debug = 0;
327 331 int nfs4_client_failover_debug = 0;
328 332 int nfs4_client_call_debug = 0;
329 333 int nfs4_client_lookup_debug = 0;
330 334 int nfs4_client_zone_debug = 0;
331 335 int nfs4_lost_rqst_debug = 0;
332 336 int nfs4_rdattrerr_debug = 0;
333 337 int nfs4_open_stream_debug = 0;
334 338
335 339 int nfs4read_error_inject;
336 340
337 341 static int nfs4_create_misses = 0;
338 342
339 343 static int nfs4_readdir_cache_shorts = 0;
340 344 static int nfs4_readdir_readahead = 0;
341 345
342 346 static int nfs4_bio_do_stop = 0;
343 347
344 348 static int nfs4_lostpage = 0; /* number of times we lost original page */
345 349
346 350 int nfs4_mmap_debug = 0;
347 351
348 352 static int nfs4_pathconf_cache_hits = 0;
349 353 static int nfs4_pathconf_cache_misses = 0;
350 354
351 355 int nfs4close_all_cnt;
352 356 int nfs4close_one_debug = 0;
353 357 int nfs4close_notw_debug = 0;
354 358
355 359 int denied_to_flk_debug = 0;
356 360 void *lockt_denied_debug;
357 361
358 362 #endif
359 363
360 364 /*
361 365 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
362 366 * or NFS4ERR_RESOURCE.
363 367 */
364 368 static int confirm_retry_sec = 30;
365 369
366 370 static int nfs4_lookup_neg_cache = 1;
367 371
368 372 /*
369 373 * number of pages to read ahead
370 374 * optimized for 100 base-T.
371 375 */
372 376 static int nfs4_nra = 4;
373 377
374 378 static int nfs4_do_symlink_cache = 1;
375 379
376 380 static int nfs4_pathconf_disable_cache = 0;
377 381
378 382 /*
379 383 * These are the vnode ops routines which implement the vnode interface to
380 384 * the networked file system. These routines just take their parameters,
381 385 * make them look networkish by putting the right info into interface structs,
382 386 * and then calling the appropriate remote routine(s) to do the work.
383 387 *
384 388 * Note on directory name lookup cacheing: If we detect a stale fhandle,
385 389 * we purge the directory cache relative to that vnode. This way, the
386 390 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
387 391 * more details on rnode locking.
388 392 */
389 393
390 394 struct vnodeops *nfs4_vnodeops;
391 395
392 396 const fs_operation_def_t nfs4_vnodeops_template[] = {
393 397 VOPNAME_OPEN, { .vop_open = nfs4_open },
394 398 VOPNAME_CLOSE, { .vop_close = nfs4_close },
395 399 VOPNAME_READ, { .vop_read = nfs4_read },
396 400 VOPNAME_WRITE, { .vop_write = nfs4_write },
397 401 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl },
398 402 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr },
399 403 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr },
400 404 VOPNAME_ACCESS, { .vop_access = nfs4_access },
401 405 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup },
402 406 VOPNAME_CREATE, { .vop_create = nfs4_create },
403 407 VOPNAME_REMOVE, { .vop_remove = nfs4_remove },
404 408 VOPNAME_LINK, { .vop_link = nfs4_link },
405 409 VOPNAME_RENAME, { .vop_rename = nfs4_rename },
406 410 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir },
407 411 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir },
408 412 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir },
409 413 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink },
410 414 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink },
411 415 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync },
412 416 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive },
413 417 VOPNAME_FID, { .vop_fid = nfs4_fid },
414 418 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock },
415 419 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock },
416 420 VOPNAME_SEEK, { .vop_seek = nfs4_seek },
417 421 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock },
418 422 VOPNAME_SPACE, { .vop_space = nfs4_space },
419 423 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp },
420 424 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage },
421 425 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage },
422 426 VOPNAME_MAP, { .vop_map = nfs4_map },
423 427 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap },
424 428 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap },
425 429 /* no separate nfs4_dump */
426 430 VOPNAME_DUMP, { .vop_dump = nfs_dump },
427 431 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf },
428 432 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio },
429 433 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose },
430 434 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr },
431 435 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr },
432 436 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock },
433 437 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
434 438 NULL, NULL
435 439 };
436 440
437 441 /*
438 442 * The following are subroutines and definitions to set args or get res
439 443 * for the different nfsv4 ops
440 444 */
441 445
442 446 void
443 447 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
444 448 {
445 449 int i;
446 450
447 451 for (i = 0; i < arglen; i++) {
448 452 if (argop[i].argop == OP_LOOKUP) {
449 453 kmem_free(
450 454 argop[i].nfs_argop4_u.oplookup.
451 455 objname.utf8string_val,
452 456 argop[i].nfs_argop4_u.oplookup.
453 457 objname.utf8string_len);
454 458 }
455 459 }
456 460 }
457 461
458 462 static void
459 463 nfs4args_lock_free(nfs_argop4 *argop)
460 464 {
461 465 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
462 466
463 467 if (locker->new_lock_owner == TRUE) {
464 468 open_to_lock_owner4 *open_owner;
465 469
466 470 open_owner = &locker->locker4_u.open_owner;
467 471 if (open_owner->lock_owner.owner_val != NULL) {
468 472 kmem_free(open_owner->lock_owner.owner_val,
469 473 open_owner->lock_owner.owner_len);
470 474 }
471 475 }
472 476 }
473 477
474 478 static void
475 479 nfs4args_lockt_free(nfs_argop4 *argop)
476 480 {
477 481 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
478 482
479 483 if (lowner->owner_val != NULL) {
480 484 kmem_free(lowner->owner_val, lowner->owner_len);
481 485 }
482 486 }
483 487
484 488 static void
485 489 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
486 490 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
487 491 nfs4_stateid_types_t *sid_types)
488 492 {
489 493 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
490 494 mntinfo4_t *mi;
491 495
492 496 argop->argop = OP_SETATTR;
493 497 /*
494 498 * The stateid is set to 0 if client is not modifying the size
495 499 * and otherwise to whatever nfs4_get_stateid() returns.
496 500 *
497 501 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
498 502 * state struct could be found for the process/file pair. We may
499 503 * want to change this in the future (by OPENing the file). See
500 504 * bug # 4474852.
501 505 */
502 506 if (vap->va_mask & AT_SIZE) {
503 507
504 508 ASSERT(rp != NULL);
505 509 mi = VTOMI4(RTOV4(rp));
506 510
507 511 argop->nfs_argop4_u.opsetattr.stateid =
508 512 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
509 513 OP_SETATTR, sid_types, FALSE);
510 514 } else {
511 515 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
512 516 sizeof (stateid4));
513 517 }
514 518
515 519 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
516 520 if (*error)
517 521 bzero(attr, sizeof (*attr));
518 522 }
519 523
520 524 static void
521 525 nfs4args_setattr_free(nfs_argop4 *argop)
522 526 {
523 527 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
524 528 }
525 529
526 530 static int
527 531 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
528 532 bitmap4 supp)
529 533 {
530 534 fattr4 *attr;
531 535 int error = 0;
532 536
533 537 argop->argop = op;
534 538 switch (op) {
535 539 case OP_VERIFY:
536 540 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
537 541 break;
538 542 case OP_NVERIFY:
539 543 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
540 544 break;
541 545 default:
542 546 return (EINVAL);
543 547 }
544 548 if (!error)
545 549 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
546 550 if (error)
547 551 bzero(attr, sizeof (*attr));
548 552 return (error);
549 553 }
550 554
551 555 static void
552 556 nfs4args_verify_free(nfs_argop4 *argop)
553 557 {
554 558 switch (argop->argop) {
555 559 case OP_VERIFY:
556 560 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
557 561 break;
558 562 case OP_NVERIFY:
559 563 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
560 564 break;
561 565 default:
562 566 break;
563 567 }
564 568 }
565 569
566 570 static void
567 571 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
568 572 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
569 573 {
570 574 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
571 575 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
572 576
573 577 argop->argop = OP_WRITE;
574 578 wargs->stable = stable;
575 579 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
576 580 mi, OP_WRITE, sid_tp);
577 581 wargs->mblk = NULL;
578 582 *wargs_pp = wargs;
579 583 }
580 584
581 585 void
582 586 nfs4args_copen_free(OPEN4cargs *open_args)
583 587 {
584 588 if (open_args->owner.owner_val) {
585 589 kmem_free(open_args->owner.owner_val,
586 590 open_args->owner.owner_len);
587 591 }
588 592 if ((open_args->opentype == OPEN4_CREATE) &&
589 593 (open_args->mode != EXCLUSIVE4)) {
590 594 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
591 595 }
592 596 }
593 597
594 598 /*
595 599 * XXX: This is referenced in modstubs.s
596 600 */
597 601 struct vnodeops *
598 602 nfs4_getvnodeops(void)
599 603 {
600 604 return (nfs4_vnodeops);
601 605 }
602 606
603 607 /*
604 608 * The OPEN operation opens a regular file.
605 609 */
606 610 /*ARGSUSED3*/
607 611 static int
608 612 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
609 613 {
610 614 vnode_t *dvp = NULL;
611 615 rnode4_t *rp, *drp;
612 616 int error;
613 617 int just_been_created;
614 618 char fn[MAXNAMELEN];
615 619
616 620 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
617 621 if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
618 622 return (EIO);
619 623 rp = VTOR4(*vpp);
620 624
621 625 /*
622 626 * Check to see if opening something besides a regular file;
623 627 * if so skip the OTW call
624 628 */
625 629 if ((*vpp)->v_type != VREG) {
626 630 error = nfs4_open_non_reg_file(vpp, flag, cr);
627 631 return (error);
628 632 }
629 633
630 634 /*
631 635 * XXX - would like a check right here to know if the file is
632 636 * executable or not, so as to skip OTW
633 637 */
634 638
635 639 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
636 640 return (error);
637 641
638 642 drp = VTOR4(dvp);
639 643 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
640 644 return (EINTR);
641 645
642 646 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
643 647 nfs_rw_exit(&drp->r_rwlock);
644 648 return (error);
645 649 }
646 650
647 651 /*
648 652 * See if this file has just been CREATEd.
649 653 * If so, clear the flag and update the dnlc, which was previously
650 654 * skipped in nfs4_create.
 651 655 	 * XXX need better serialization on this.
 652 656 	 * XXX move this into the nfs4open_otw call, after we have
653 657 * XXX acquired the open owner seqid sync.
654 658 */
655 659 mutex_enter(&rp->r_statev4_lock);
656 660 if (rp->created_v4) {
657 661 rp->created_v4 = 0;
658 662 mutex_exit(&rp->r_statev4_lock);
659 663
660 664 dnlc_update(dvp, fn, *vpp);
661 665 /* This is needed so we don't bump the open ref count */
662 666 just_been_created = 1;
663 667 } else {
664 668 mutex_exit(&rp->r_statev4_lock);
665 669 just_been_created = 0;
666 670 }
667 671
668 672 /*
669 673 * If caller specified O_TRUNC/FTRUNC, then be sure to set
670 674 * FWRITE (to drive successful setattr(size=0) after open)
671 675 */
672 676 if (flag & FTRUNC)
673 677 flag |= FWRITE;
674 678
675 679 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
676 680 just_been_created);
677 681
678 682 if (!error && !((*vpp)->v_flag & VROOT))
679 683 dnlc_update(dvp, fn, *vpp);
680 684
681 685 nfs_rw_exit(&drp->r_rwlock);
682 686
683 687 /* release the hold from vtodv */
684 688 VN_RELE(dvp);
685 689
686 690 /* exchange the shadow for the master vnode, if needed */
687 691
688 692 if (error == 0 && IS_SHADOW(*vpp, rp))
689 693 sv_exchange(vpp);
690 694
691 695 return (error);
692 696 }
693 697
694 698 /*
695 699 * See if there's a "lost open" request to be saved and recovered.
696 700 */
697 701 static void
698 702 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
699 703 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
700 704 vnode_t *dvp, OPEN4cargs *open_args)
701 705 {
702 706 vfs_t *vfsp;
703 707 char *srccfp;
704 708
705 709 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
706 710
707 711 if (error != ETIMEDOUT && error != EINTR &&
708 712 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
709 713 lost_rqstp->lr_op = 0;
710 714 return;
711 715 }
712 716
713 717 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
714 718 "nfs4open_save_lost_rqst: error %d", error));
715 719
716 720 lost_rqstp->lr_op = OP_OPEN;
717 721
718 722 /*
719 723 * The vp (if it is not NULL) and dvp are held and rele'd via
720 724 * the recovery code. See nfs4_save_lost_rqst.
721 725 */
722 726 lost_rqstp->lr_vp = vp;
723 727 lost_rqstp->lr_dvp = dvp;
724 728 lost_rqstp->lr_oop = oop;
725 729 lost_rqstp->lr_osp = NULL;
726 730 lost_rqstp->lr_lop = NULL;
727 731 lost_rqstp->lr_cr = cr;
728 732 lost_rqstp->lr_flk = NULL;
729 733 lost_rqstp->lr_oacc = open_args->share_access;
730 734 lost_rqstp->lr_odeny = open_args->share_deny;
731 735 lost_rqstp->lr_oclaim = open_args->claim;
732 736 if (open_args->claim == CLAIM_DELEGATE_CUR) {
733 737 lost_rqstp->lr_ostateid =
734 738 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
735 739 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
736 740 } else {
737 741 srccfp = open_args->open_claim4_u.cfile;
738 742 }
739 743 lost_rqstp->lr_ofile.utf8string_len = 0;
740 744 lost_rqstp->lr_ofile.utf8string_val = NULL;
741 745 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
742 746 lost_rqstp->lr_putfirst = FALSE;
743 747 }
744 748
745 749 struct nfs4_excl_time {
746 750 uint32 seconds;
747 751 uint32 nseconds;
748 752 };
749 753
750 754 /*
751 755 * The OPEN operation creates and/or opens a regular file
752 756 *
753 757 * ARGSUSED
754 758 */
755 759 static int
756 760 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
757 761 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
758 762 enum createmode4 createmode, int file_just_been_created)
759 763 {
760 764 rnode4_t *rp;
761 765 rnode4_t *drp = VTOR4(dvp);
762 766 vnode_t *vp = NULL;
763 767 vnode_t *vpi = *vpp;
764 768 bool_t needrecov = FALSE;
765 769
766 770 int doqueue = 1;
767 771
768 772 COMPOUND4args_clnt args;
769 773 COMPOUND4res_clnt res;
770 774 nfs_argop4 *argop;
771 775 nfs_resop4 *resop;
772 776 int argoplist_size;
773 777 int idx_open, idx_fattr;
774 778
775 779 GETFH4res *gf_res = NULL;
776 780 OPEN4res *op_res = NULL;
777 781 nfs4_ga_res_t *garp;
778 782 fattr4 *attr = NULL;
779 783 struct nfs4_excl_time verf;
780 784 bool_t did_excl_setup = FALSE;
781 785 int created_osp;
782 786
783 787 OPEN4cargs *open_args;
784 788 nfs4_open_owner_t *oop = NULL;
785 789 nfs4_open_stream_t *osp = NULL;
786 790 seqid4 seqid = 0;
787 791 bool_t retry_open = FALSE;
788 792 nfs4_recov_state_t recov_state;
789 793 nfs4_lost_rqst_t lost_rqst;
790 794 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
791 795 hrtime_t t;
792 796 int acc = 0;
793 797 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
794 798 cred_t *ncr = NULL;
795 799
796 800 nfs4_sharedfh_t *otw_sfh;
797 801 nfs4_sharedfh_t *orig_sfh;
798 802 int fh_differs = 0;
799 803 int numops, setgid_flag;
800 804 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
801 805
802 806 /*
803 807 * Make sure we properly deal with setting the right gid on
804 808 * a newly created file to reflect the parent's setgid bit
805 809 */
806 810 setgid_flag = 0;
807 811 if (create_flag && in_va) {
808 812
809 813 /*
810 814 * If there is grpid mount flag used or
811 815 * the parent's directory has the setgid bit set
812 816 * _and_ the client was able to get a valid mapping
813 817 * for the parent dir's owner_group, we want to
814 818 * append NVERIFY(owner_group == dva.va_gid) and
815 819 * SETATTR to the CREATE compound.
816 820 */
817 821 mutex_enter(&drp->r_statelock);
818 822 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
819 823 drp->r_attr.va_mode & VSGID) &&
820 824 drp->r_attr.va_gid != GID_NOBODY) {
821 825 in_va->va_mask |= AT_GID;
822 826 in_va->va_gid = drp->r_attr.va_gid;
823 827 setgid_flag = 1;
824 828 }
825 829 mutex_exit(&drp->r_statelock);
826 830 }
827 831
828 832 /*
829 833 * Normal/non-create compound:
830 834 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
831 835 *
832 836 * Open(create) compound no setgid:
833 837 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
834 838 * RESTOREFH + GETATTR
835 839 *
836 840 * Open(create) setgid:
837 841 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
838 842 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
839 843 * NVERIFY(grp) + SETATTR
840 844 */
841 845 if (setgid_flag) {
842 846 numops = 10;
843 847 idx_open = 1;
844 848 idx_fattr = 3;
845 849 } else if (create_flag) {
846 850 numops = 7;
847 851 idx_open = 2;
848 852 idx_fattr = 4;
849 853 } else {
850 854 numops = 4;
851 855 idx_open = 1;
852 856 idx_fattr = 3;
853 857 }
854 858
855 859 args.array_len = numops;
856 860 argoplist_size = numops * sizeof (nfs_argop4);
857 861 argop = kmem_alloc(argoplist_size, KM_SLEEP);
858 862
859 863 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
860 864 "open %s open flag 0x%x cred %p", file_name, open_flag,
861 865 (void *)cr));
862 866
863 867 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
864 868 if (create_flag) {
865 869 /*
866 870 * We are to create a file. Initialize the passed in vnode
867 871 * pointer.
868 872 */
869 873 vpi = NULL;
870 874 } else {
871 875 /*
872 876 * Check to see if the client owns a read delegation and is
873 877 * trying to open for write. If so, then return the delegation
874 878 * to avoid the server doing a cb_recall and returning DELAY.
875 879 * NB - we don't use the statev4_lock here because we'd have
876 880 * to drop the lock anyway and the result would be stale.
877 881 */
878 882 if ((open_flag & FWRITE) &&
879 883 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
880 884 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
881 885
882 886 /*
883 887 * If the file has a delegation, then do an access check up
 884  888  		 * front. This avoids having to do an access check later after
885 889 * we've already done start_op, which could deadlock.
886 890 */
887 891 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
888 892 if (open_flag & FREAD &&
889 893 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
890 894 acc |= VREAD;
891 895 if (open_flag & FWRITE &&
892 896 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
893 897 acc |= VWRITE;
894 898 }
895 899 }
896 900
897 901 drp = VTOR4(dvp);
898 902
899 903 recov_state.rs_flags = 0;
900 904 recov_state.rs_num_retry_despite_err = 0;
901 905 cred_otw = cr;
902 906
903 907 recov_retry:
904 908 fh_differs = 0;
905 909 nfs4_error_zinit(&e);
906 910
907 911 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
908 912 if (e.error) {
909 913 if (ncr != NULL)
910 914 crfree(ncr);
911 915 kmem_free(argop, argoplist_size);
912 916 return (e.error);
913 917 }
914 918
915 919 args.ctag = TAG_OPEN;
916 920 args.array_len = numops;
917 921 args.array = argop;
918 922
919 923 /* putfh directory fh */
920 924 argop[0].argop = OP_CPUTFH;
921 925 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
922 926
923 927 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
924 928 argop[idx_open].argop = OP_COPEN;
925 929 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
926 930 open_args->claim = CLAIM_NULL;
927 931
928 932 /* name of file */
929 933 open_args->open_claim4_u.cfile = file_name;
930 934 open_args->owner.owner_len = 0;
931 935 open_args->owner.owner_val = NULL;
932 936
933 937 if (create_flag) {
934 938 /* CREATE a file */
935 939 open_args->opentype = OPEN4_CREATE;
936 940 open_args->mode = createmode;
937 941 if (createmode == EXCLUSIVE4) {
938 942 if (did_excl_setup == FALSE) {
939 943 verf.seconds = zone_get_hostid(NULL);
940 944 if (verf.seconds != 0)
941 945 verf.nseconds = newnum();
942 946 else {
943 947 timestruc_t now;
944 948
945 949 gethrestime(&now);
946 950 verf.seconds = now.tv_sec;
947 951 verf.nseconds = now.tv_nsec;
948 952 }
949 953 /*
950 954 * Since the server will use this value for the
951 955 * mtime, make sure that it can't overflow. Zero
952 956 * out the MSB. The actual value does not matter
 953  957  				 * here, only its uniqueness.
954 958 */
955 959 verf.seconds &= INT32_MAX;
956 960 did_excl_setup = TRUE;
957 961 }
958 962
959 963 /* Now copy over verifier to OPEN4args. */
960 964 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
961 965 } else {
962 966 int v_error;
963 967 bitmap4 supp_attrs;
964 968 servinfo4_t *svp;
965 969
966 970 attr = &open_args->createhow4_u.createattrs;
967 971
968 972 svp = drp->r_server;
969 973 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
970 974 supp_attrs = svp->sv_supp_attrs;
971 975 nfs_rw_exit(&svp->sv_lock);
972 976
973 977 /* GUARDED4 or UNCHECKED4 */
974 978 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
975 979 supp_attrs);
976 980 if (v_error) {
977 981 bzero(attr, sizeof (*attr));
978 982 nfs4args_copen_free(open_args);
979 983 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
980 984 &recov_state, FALSE);
981 985 if (ncr != NULL)
982 986 crfree(ncr);
983 987 kmem_free(argop, argoplist_size);
984 988 return (v_error);
985 989 }
986 990 }
987 991 } else {
988 992 /* NO CREATE */
989 993 open_args->opentype = OPEN4_NOCREATE;
990 994 }
991 995
992 996 if (recov_state.rs_sp != NULL) {
993 997 mutex_enter(&recov_state.rs_sp->s_lock);
994 998 open_args->owner.clientid = recov_state.rs_sp->clientid;
995 999 mutex_exit(&recov_state.rs_sp->s_lock);
996 1000 } else {
997 1001 /* XXX should we just fail here? */
998 1002 open_args->owner.clientid = 0;
999 1003 }
1000 1004
1001 1005 /*
1002 1006 * This increments oop's ref count or creates a temporary 'just_created'
1003 1007 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1004 1008 * completes.
1005 1009 */
1006 1010 mutex_enter(&VTOMI4(dvp)->mi_lock);
1007 1011
1008 1012 /* See if a permanent or just created open owner exists */
1009 1013 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1010 1014 if (!oop) {
1011 1015 /*
1012 1016 * This open owner does not exist so create a temporary
1013 1017 * just created one.
1014 1018 */
1015 1019 oop = create_open_owner(cr, VTOMI4(dvp));
1016 1020 ASSERT(oop != NULL);
1017 1021 }
1018 1022 mutex_exit(&VTOMI4(dvp)->mi_lock);
1019 1023
1020 1024 /* this length never changes, do alloc before seqid sync */
1021 1025 open_args->owner.owner_len = sizeof (oop->oo_name);
1022 1026 open_args->owner.owner_val =
1023 1027 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1024 1028
1025 1029 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1026 1030 if (e.error == EAGAIN) {
1027 1031 open_owner_rele(oop);
1028 1032 nfs4args_copen_free(open_args);
1029 1033 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1030 1034 if (ncr != NULL) {
1031 1035 crfree(ncr);
1032 1036 ncr = NULL;
1033 1037 }
1034 1038 goto recov_retry;
1035 1039 }
1036 1040
1037 1041 /* Check to see if we need to do the OTW call */
1038 1042 if (!create_flag) {
1039 1043 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1040 1044 file_just_been_created, &e.error, acc, &recov_state)) {
1041 1045
1042 1046 /*
1043 1047 * The OTW open is not necessary. Either
1044 1048 * the open can succeed without it (eg.
1045 1049 * delegation, error == 0) or the open
1046 1050 * must fail due to an access failure
1047 1051 * (error != 0). In either case, tidy
1048 1052 * up and return.
1049 1053 */
1050 1054
1051 1055 nfs4_end_open_seqid_sync(oop);
1052 1056 open_owner_rele(oop);
1053 1057 nfs4args_copen_free(open_args);
1054 1058 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1055 1059 if (ncr != NULL)
1056 1060 crfree(ncr);
1057 1061 kmem_free(argop, argoplist_size);
1058 1062 return (e.error);
1059 1063 }
1060 1064 }
1061 1065
1062 1066 bcopy(&oop->oo_name, open_args->owner.owner_val,
1063 1067 open_args->owner.owner_len);
1064 1068
1065 1069 seqid = nfs4_get_open_seqid(oop) + 1;
1066 1070 open_args->seqid = seqid;
1067 1071 open_args->share_access = 0;
1068 1072 if (open_flag & FREAD)
1069 1073 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1070 1074 if (open_flag & FWRITE)
1071 1075 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1072 1076 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1073 1077
1074 1078
1075 1079
1076 1080 /*
1077 1081 * getfh w/sanity check for idx_open/idx_fattr
1078 1082 */
1079 1083 ASSERT((idx_open + 1) == (idx_fattr - 1));
1080 1084 argop[idx_open + 1].argop = OP_GETFH;
1081 1085
1082 1086 /* getattr */
1083 1087 argop[idx_fattr].argop = OP_GETATTR;
1084 1088 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1085 1089 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1086 1090
1087 1091 if (setgid_flag) {
1088 1092 vattr_t _v;
1089 1093 servinfo4_t *svp;
1090 1094 bitmap4 supp_attrs;
1091 1095
1092 1096 svp = drp->r_server;
1093 1097 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1094 1098 supp_attrs = svp->sv_supp_attrs;
1095 1099 nfs_rw_exit(&svp->sv_lock);
1096 1100
1097 1101 /*
1098 1102 * For setgid case, we need to:
1099 1103 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1100 1104 */
1101 1105 argop[4].argop = OP_SAVEFH;
1102 1106
1103 1107 argop[5].argop = OP_CPUTFH;
1104 1108 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1105 1109
1106 1110 argop[6].argop = OP_GETATTR;
1107 1111 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1108 1112 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1109 1113
1110 1114 argop[7].argop = OP_RESTOREFH;
1111 1115
1112 1116 /*
1113 1117 * nverify
1114 1118 */
1115 1119 _v.va_mask = AT_GID;
1116 1120 _v.va_gid = in_va->va_gid;
1117 1121 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1118 1122 supp_attrs))) {
1119 1123
1120 1124 /*
1121 1125 * setattr
1122 1126 *
1123 1127 * We _know_ we're not messing with AT_SIZE or
1124 1128 * AT_XTIME, so no need for stateid or flags.
1125 1129 * Also we specify NULL rp since we're only
1126 1130 * interested in setting owner_group attributes.
1127 1131 */
1128 1132 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1129 1133 supp_attrs, &e.error, 0);
1130 1134 if (e.error)
1131 1135 nfs4args_verify_free(&argop[8]);
1132 1136 }
1133 1137
1134 1138 if (e.error) {
1135 1139 /*
1136 1140 * XXX - Revisit the last argument to nfs4_end_op()
1137 1141 * once 5020486 is fixed.
1138 1142 */
1139 1143 nfs4_end_open_seqid_sync(oop);
1140 1144 open_owner_rele(oop);
1141 1145 nfs4args_copen_free(open_args);
1142 1146 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1143 1147 if (ncr != NULL)
1144 1148 crfree(ncr);
1145 1149 kmem_free(argop, argoplist_size);
1146 1150 return (e.error);
1147 1151 }
1148 1152 } else if (create_flag) {
1149 1153 argop[1].argop = OP_SAVEFH;
1150 1154
1151 1155 argop[5].argop = OP_RESTOREFH;
1152 1156
1153 1157 argop[6].argop = OP_GETATTR;
1154 1158 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1155 1159 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1156 1160 }
1157 1161
1158 1162 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1159 1163 "nfs4open_otw: %s call, nm %s, rp %s",
1160 1164 needrecov ? "recov" : "first", file_name,
1161 1165 rnode4info(VTOR4(dvp))));
1162 1166
1163 1167 t = gethrtime();
1164 1168
1165 1169 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1166 1170
1167 1171 if (!e.error && nfs4_need_to_bump_seqid(&res))
1168 1172 nfs4_set_open_seqid(seqid, oop, args.ctag);
1169 1173
1170 1174 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1171 1175
1172 1176 if (e.error || needrecov) {
1173 1177 bool_t abort = FALSE;
1174 1178
1175 1179 if (needrecov) {
1176 1180 nfs4_bseqid_entry_t *bsep = NULL;
1177 1181
1178 1182 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1179 1183 cred_otw, vpi, dvp, open_args);
1180 1184
1181 1185 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1182 1186 bsep = nfs4_create_bseqid_entry(oop, NULL,
1183 1187 vpi, 0, args.ctag, open_args->seqid);
1184 1188 num_bseqid_retry--;
1185 1189 }
1186 1190
1187 1191 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1188 1192 NULL, lost_rqst.lr_op == OP_OPEN ?
1189 1193 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1190 1194
1191 1195 if (bsep)
1192 1196 kmem_free(bsep, sizeof (*bsep));
1193 1197 /* give up if we keep getting BAD_SEQID */
1194 1198 if (num_bseqid_retry == 0)
1195 1199 abort = TRUE;
1196 1200 if (abort == TRUE && e.error == 0)
1197 1201 e.error = geterrno4(res.status);
1198 1202 }
1199 1203 nfs4_end_open_seqid_sync(oop);
1200 1204 open_owner_rele(oop);
1201 1205 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1202 1206 nfs4args_copen_free(open_args);
1203 1207 if (setgid_flag) {
1204 1208 nfs4args_verify_free(&argop[8]);
1205 1209 nfs4args_setattr_free(&argop[9]);
1206 1210 }
1207 1211 if (!e.error)
1208 1212 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1209 1213 if (ncr != NULL) {
1210 1214 crfree(ncr);
1211 1215 ncr = NULL;
1212 1216 }
1213 1217 if (!needrecov || abort == TRUE || e.error == EINTR ||
1214 1218 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1215 1219 kmem_free(argop, argoplist_size);
1216 1220 return (e.error);
1217 1221 }
1218 1222 goto recov_retry;
1219 1223 }
1220 1224
1221 1225 /*
1222 1226 * Will check and update lease after checking the rflag for
1223 1227 * OPEN_CONFIRM in the successful OPEN call.
1224 1228 */
1225 1229 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1226 1230
1227 1231 /*
1228 1232 * XXX what if we're crossing mount points from server1:/drp
1229 1233 * to server2:/drp/rp.
1230 1234 */
1231 1235
1232 1236 /* Signal our end of use of the open seqid */
1233 1237 nfs4_end_open_seqid_sync(oop);
1234 1238
1235 1239 /*
1236 1240 * This will destroy the open owner if it was just created,
1237 1241 * and no one else has put a reference on it.
1238 1242 */
1239 1243 open_owner_rele(oop);
1240 1244 if (create_flag && (createmode != EXCLUSIVE4) &&
1241 1245 res.status == NFS4ERR_BADOWNER)
1242 1246 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1243 1247
1244 1248 e.error = geterrno4(res.status);
1245 1249 nfs4args_copen_free(open_args);
1246 1250 if (setgid_flag) {
1247 1251 nfs4args_verify_free(&argop[8]);
1248 1252 nfs4args_setattr_free(&argop[9]);
1249 1253 }
1250 1254 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1251 1255 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1252 1256 /*
1253 1257 * If the reply is NFS4ERR_ACCESS, it may be because
1254 1258 * we are root (no root net access). If the real uid
1255 1259 * is not root, then retry with the real uid instead.
1256 1260 */
1257 1261 if (ncr != NULL) {
1258 1262 crfree(ncr);
1259 1263 ncr = NULL;
1260 1264 }
1261 1265 if (res.status == NFS4ERR_ACCESS &&
1262 1266 (ncr = crnetadjust(cred_otw)) != NULL) {
1263 1267 cred_otw = ncr;
1264 1268 goto recov_retry;
1265 1269 }
1266 1270 kmem_free(argop, argoplist_size);
1267 1271 return (e.error);
1268 1272 }
1269 1273
1270 1274 resop = &res.array[idx_open]; /* open res */
1271 1275 op_res = &resop->nfs_resop4_u.opopen;
1272 1276
1273 1277 #ifdef DEBUG
1274 1278 /*
1275 1279 * verify attrset bitmap
1276 1280 */
1277 1281 if (create_flag &&
1278 1282 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1279 1283 /* make sure attrset returned is what we asked for */
1280 1284 /* XXX Ignore this 'error' for now */
1281 1285 if (attr->attrmask != op_res->attrset)
1282 1286 /* EMPTY */;
1283 1287 }
1284 1288 #endif
1285 1289
1286 1290 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1287 1291 mutex_enter(&VTOMI4(dvp)->mi_lock);
1288 1292 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1289 1293 mutex_exit(&VTOMI4(dvp)->mi_lock);
1290 1294 }
1291 1295
1292 1296 resop = &res.array[idx_open + 1]; /* getfh res */
1293 1297 gf_res = &resop->nfs_resop4_u.opgetfh;
1294 1298
1295 1299 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1296 1300
1297 1301 /*
1298 1302 * The open stateid has been updated on the server but not
1299 1303 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1300 1304 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1301 1305 * WRITE call. That, however, will use the old stateid, so go ahead
1302 1306  	 * and update the open stateid now, before any call to makenfs4node.
1303 1307 */
1304 1308 if (vpi) {
1305 1309 nfs4_open_stream_t *tmp_osp;
1306 1310 rnode4_t *tmp_rp = VTOR4(vpi);
1307 1311
1308 1312 tmp_osp = find_open_stream(oop, tmp_rp);
1309 1313 if (tmp_osp) {
1310 1314 tmp_osp->open_stateid = op_res->stateid;
1311 1315 mutex_exit(&tmp_osp->os_sync_lock);
1312 1316 open_stream_rele(tmp_osp, tmp_rp);
1313 1317 }
1314 1318
1315 1319 /*
1316 1320 * We must determine if the file handle given by the otw open
1317 1321 * is the same as the file handle which was passed in with
1318 1322 * *vpp. This case can be reached if the file we are trying
1319 1323 * to open has been removed and another file has been created
1320 1324 * having the same file name. The passed in vnode is released
1321 1325 * later.
1322 1326 */
1323 1327 orig_sfh = VTOR4(vpi)->r_fh;
1324 1328 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1325 1329 }
1326 1330
1327 1331 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1328 1332
1329 1333 if (create_flag || fh_differs) {
1330 1334 int rnode_err = 0;
1331 1335
1332 1336 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1333 1337 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1334 1338
1335 1339 if (e.error)
1336 1340 PURGE_ATTRCACHE4(vp);
1337 1341 /*
1338 1342 * For the newly created vp case, make sure the rnode
1339 1343 * isn't bad before using it.
1340 1344 */
1341 1345 mutex_enter(&(VTOR4(vp))->r_statelock);
1342 1346 if (VTOR4(vp)->r_flags & R4RECOVERR)
1343 1347 rnode_err = EIO;
1344 1348 mutex_exit(&(VTOR4(vp))->r_statelock);
1345 1349
1346 1350 if (rnode_err) {
1347 1351 nfs4_end_open_seqid_sync(oop);
1348 1352 nfs4args_copen_free(open_args);
1349 1353 if (setgid_flag) {
1350 1354 nfs4args_verify_free(&argop[8]);
1351 1355 nfs4args_setattr_free(&argop[9]);
1352 1356 }
1353 1357 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1354 1358 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1355 1359 needrecov);
1356 1360 open_owner_rele(oop);
1357 1361 VN_RELE(vp);
1358 1362 if (ncr != NULL)
1359 1363 crfree(ncr);
1360 1364 sfh4_rele(&otw_sfh);
1361 1365 kmem_free(argop, argoplist_size);
1362 1366 return (EIO);
1363 1367 }
1364 1368 } else {
1365 1369 vp = vpi;
1366 1370 }
1367 1371 sfh4_rele(&otw_sfh);
1368 1372
1369 1373 /*
1370 1374 * It seems odd to get a full set of attrs and then not update
1371 1375 * the object's attrcache in the non-create case. Create case uses
1372 1376 * the attrs since makenfs4node checks to see if the attrs need to
1373 1377 * be updated (and then updates them). The non-create case should
1374 1378 * update attrs also.
1375 1379 */
1376 1380 if (! create_flag && ! fh_differs && !e.error) {
1377 1381 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1378 1382 }
1379 1383
1380 1384 nfs4_error_zinit(&e);
1381 1385 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1382 1386 /* This does not do recovery for vp explicitly. */
1383 1387 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1384 1388 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1385 1389
1386 1390 if (e.error || e.stat) {
1387 1391 nfs4_end_open_seqid_sync(oop);
1388 1392 nfs4args_copen_free(open_args);
1389 1393 if (setgid_flag) {
1390 1394 nfs4args_verify_free(&argop[8]);
1391 1395 nfs4args_setattr_free(&argop[9]);
1392 1396 }
1393 1397 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1394 1398 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1395 1399 needrecov);
1396 1400 open_owner_rele(oop);
1397 1401 if (create_flag || fh_differs) {
1398 1402 /* rele the makenfs4node */
1399 1403 VN_RELE(vp);
1400 1404 }
1401 1405 if (ncr != NULL) {
1402 1406 crfree(ncr);
1403 1407 ncr = NULL;
1404 1408 }
1405 1409 if (retry_open == TRUE) {
1406 1410 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1407 1411 "nfs4open_otw: retry the open since OPEN "
1408 1412 "CONFIRM failed with error %d stat %d",
1409 1413 e.error, e.stat));
1410 1414 if (create_flag && createmode == GUARDED4) {
1411 1415 NFS4_DEBUG(nfs4_client_recov_debug,
1412 1416 (CE_NOTE, "nfs4open_otw: switch "
1413 1417 "createmode from GUARDED4 to "
1414 1418 "UNCHECKED4"));
1415 1419 createmode = UNCHECKED4;
1416 1420 }
1417 1421 goto recov_retry;
1418 1422 }
1419 1423 if (!e.error) {
1420 1424 if (create_flag && (createmode != EXCLUSIVE4) &&
1421 1425 e.stat == NFS4ERR_BADOWNER)
1422 1426 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1423 1427
1424 1428 e.error = geterrno4(e.stat);
1425 1429 }
1426 1430 kmem_free(argop, argoplist_size);
1427 1431 return (e.error);
1428 1432 }
1429 1433 }
1430 1434
1431 1435 rp = VTOR4(vp);
1432 1436
1433 1437 mutex_enter(&rp->r_statev4_lock);
1434 1438 if (create_flag)
1435 1439 rp->created_v4 = 1;
1436 1440 mutex_exit(&rp->r_statev4_lock);
1437 1441
1438 1442 mutex_enter(&oop->oo_lock);
1439 1443 /* Doesn't matter if 'oo_just_created' already was set as this */
1440 1444 oop->oo_just_created = NFS4_PERM_CREATED;
1441 1445 if (oop->oo_cred_otw)
1442 1446 crfree(oop->oo_cred_otw);
1443 1447 oop->oo_cred_otw = cred_otw;
1444 1448 crhold(oop->oo_cred_otw);
1445 1449 mutex_exit(&oop->oo_lock);
1446 1450
1447 1451 /* returns with 'os_sync_lock' held */
1448 1452 osp = find_or_create_open_stream(oop, rp, &created_osp);
1449 1453 if (!osp) {
1450 1454 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1451 1455 "nfs4open_otw: failed to create an open stream"));
1452 1456 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1453 1457 "signal our end of use of the open seqid"));
1454 1458
1455 1459 nfs4_end_open_seqid_sync(oop);
1456 1460 open_owner_rele(oop);
1457 1461 nfs4args_copen_free(open_args);
1458 1462 if (setgid_flag) {
1459 1463 nfs4args_verify_free(&argop[8]);
1460 1464 nfs4args_setattr_free(&argop[9]);
1461 1465 }
1462 1466 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1463 1467 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1464 1468 if (create_flag || fh_differs)
1465 1469 VN_RELE(vp);
1466 1470 if (ncr != NULL)
1467 1471 crfree(ncr);
1468 1472
1469 1473 kmem_free(argop, argoplist_size);
1470 1474 return (EINVAL);
1471 1475
1472 1476 }
1473 1477
1474 1478 osp->open_stateid = op_res->stateid;
1475 1479
1476 1480 if (open_flag & FREAD)
1477 1481 osp->os_share_acc_read++;
1478 1482 if (open_flag & FWRITE)
1479 1483 osp->os_share_acc_write++;
1480 1484 osp->os_share_deny_none++;
1481 1485
1482 1486 /*
1483 1487 * Need to reset this bitfield for the possible case where we were
1484 1488 * going to OTW CLOSE the file, got a non-recoverable error, and before
1485 1489 * we could retry the CLOSE, OPENed the file again.
1486 1490 */
1487 1491 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1488 1492 osp->os_final_close = 0;
1489 1493 osp->os_force_close = 0;
1490 1494 #ifdef DEBUG
1491 1495 if (osp->os_failed_reopen)
1492 1496 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1493 1497 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1494 1498 (void *)osp, (void *)cr, rnode4info(rp)));
1495 1499 #endif
1496 1500 osp->os_failed_reopen = 0;
1497 1501
1498 1502 mutex_exit(&osp->os_sync_lock);
1499 1503
1500 1504 nfs4_end_open_seqid_sync(oop);
1501 1505
1502 1506 if (created_osp && recov_state.rs_sp != NULL) {
1503 1507 mutex_enter(&recov_state.rs_sp->s_lock);
1504 1508 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1505 1509 mutex_exit(&recov_state.rs_sp->s_lock);
1506 1510 }
1507 1511
1508 1512 /* get rid of our reference to find oop */
1509 1513 open_owner_rele(oop);
1510 1514
1511 1515 open_stream_rele(osp, rp);
1512 1516
1513 1517 /* accept delegation, if any */
1514 1518 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1515 1519
1516 1520 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1517 1521
1518 1522 if (createmode == EXCLUSIVE4 &&
1519 1523 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1520 1524 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1521 1525 " EXCLUSIVE4: sending a SETATTR"));
1522 1526 /*
1523 1527 * If doing an exclusive create, then generate
1524 1528 * a SETATTR to set the initial attributes.
1525 1529 * Try to set the mtime and the atime to the
1526 1530 * server's current time. It is somewhat
1527 1531 * expected that these fields will be used to
1528 1532 * store the exclusive create cookie. If not,
1529 1533 * server implementors will need to know that
1530 1534 * a SETATTR will follow an exclusive create
1531 1535 * and the cookie should be destroyed if
1532 1536 * appropriate.
1533 1537 *
1534 1538 * The AT_GID and AT_SIZE bits are turned off
1535 1539 * so that the SETATTR request will not attempt
1536 1540 * to process these. The gid will be set
1537 1541 * separately if appropriate. The size is turned
1538 1542 * off because it is assumed that a new file will
1539 1543 * be created empty and if the file wasn't empty,
1540 1544 * then the exclusive create will have failed
1541 1545 * because the file must have existed already.
1542 1546 * Therefore, no truncate operation is needed.
1543 1547 */
1544 1548 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1545 1549 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1546 1550
1547 1551 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1548 1552 if (e.error) {
1549 1553 /*
1550 1554 * Couldn't correct the attributes of
1551 1555 * the newly created file and the
1552 1556 * attributes are wrong. Remove the
1553 1557 * file and return an error to the
1554 1558 * application.
1555 1559 */
1556 1560 /* XXX will this take care of client state ? */
1557 1561 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1558 1562 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1559 1563 " remove file", e.error));
1560 1564 VN_RELE(vp);
1561 1565 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1562 1566 /*
1563 1567 * Since we've reled the vnode and removed
1564 1568 * the file we now need to return the error.
1565 1569 * At this point we don't want to update the
1566 1570 * dircaches, call nfs4_waitfor_purge_complete
1567 1571 * or set vpp to vp so we need to skip these
1568 1572 * as well.
1569 1573 */
1570 1574 goto skip_update_dircaches;
1571 1575 }
1572 1576 }
1573 1577
1574 1578 /*
1575 1579 * If we created or found the correct vnode, due to create_flag or
1576 1580 * fh_differs being set, then update directory cache attribute, readdir
1577 1581 * and dnlc caches.
1578 1582 */
1579 1583 if (create_flag || fh_differs) {
1580 1584 dirattr_info_t dinfo, *dinfop;
1581 1585
1582 1586 /*
1583 1587 * Make sure getattr succeeded before using results.
1584 1588 * note: op 7 is getattr(dir) for both flavors of
1585 1589 * open(create).
1586 1590 */
1587 1591 if (create_flag && res.status == NFS4_OK) {
1588 1592 dinfo.di_time_call = t;
1589 1593 dinfo.di_cred = cr;
1590 1594 dinfo.di_garp =
1591 1595 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1592 1596 dinfop = &dinfo;
1593 1597 } else {
1594 1598 dinfop = NULL;
1595 1599 }
1596 1600
1597 1601 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1598 1602 dinfop);
1599 1603 }
1600 1604
1601 1605 /*
1602 1606 * If the page cache for this file was flushed from actions
1603 1607 * above, it was done asynchronously and if that is true,
1604 1608 * there is a need to wait here for it to complete. This must
1605 1609 * be done outside of start_fop/end_fop.
1606 1610 */
1607 1611 (void) nfs4_waitfor_purge_complete(vp);
1608 1612
1609 1613 /*
1610 1614 * It is implicit that we are in the open case (create_flag == 0) since
1611 1615 * fh_differs can only be set to a non-zero value in the open case.
1612 1616 */
1613 1617 if (fh_differs != 0 && vpi != NULL)
1614 1618 VN_RELE(vpi);
1615 1619
1616 1620 /*
1617 1621 * Be sure to set *vpp to the correct value before returning.
1618 1622 */
1619 1623 *vpp = vp;
1620 1624
1621 1625 skip_update_dircaches:
1622 1626
1623 1627 nfs4args_copen_free(open_args);
1624 1628 if (setgid_flag) {
1625 1629 nfs4args_verify_free(&argop[8]);
1626 1630 nfs4args_setattr_free(&argop[9]);
1627 1631 }
1628 1632 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1629 1633
1630 1634 if (ncr)
1631 1635 crfree(ncr);
1632 1636 kmem_free(argop, argoplist_size);
1633 1637 return (e.error);
1634 1638 }
1635 1639
1636 1640 /*
1637 1641 * Reopen an open instance. cf. nfs4open_otw().
1638 1642 *
1639 1643 * Errors are returned by the nfs4_error_t parameter.
1640 1644 * - ep->error contains an errno value or zero.
1641 1645 * - if it is zero, ep->stat is set to an NFS status code, if any.
1642 1646 * If the file could not be reopened, but the caller should continue, the
1643 1647 * file is marked dead and no error values are returned. If the caller
1644 1648 * should stop recovering open files and start over, either the ep->error
1645 1649 * value or ep->stat will indicate an error (either something that requires
1646 1650 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1647 1651 * filehandles) may be handled silently by this routine.
1648 1652 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1649 1653 * will be started, so the caller should not do it.
1650 1654 *
1651 1655 * Gotos:
1652 1656 * - kill_file : reopen failed in such a fashion to constitute marking the
1653 1657 * file dead and setting the open stream's 'os_failed_reopen' as 1. This
1654 1658 * is for cases where recovery is not possible.
1655 1659 * - failed_reopen : same as above, except that the file has already been
1656 1660 * marked dead, so no need to do it again.
1657 1661 * - bailout : reopen failed but we are able to recover and retry the reopen -
1658 1662 * either within this function immediately or via the calling function.
1659 1663 */
1660 1664
1661 1665 void
1662 1666 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1663 1667 open_claim_type4 claim, bool_t frc_use_claim_previous,
1664 1668 bool_t is_recov)
1665 1669 {
1666 1670 COMPOUND4args_clnt args;
1667 1671 COMPOUND4res_clnt res;
1668 1672 nfs_argop4 argop[4];
1669 1673 nfs_resop4 *resop;
1670 1674 OPEN4res *op_res = NULL;
1671 1675 OPEN4cargs *open_args;
1672 1676 GETFH4res *gf_res;
1673 1677 rnode4_t *rp = VTOR4(vp);
1674 1678 int doqueue = 1;
1675 1679 cred_t *cr = NULL, *cred_otw = NULL;
1676 1680 nfs4_open_owner_t *oop = NULL;
1677 1681 seqid4 seqid;
1678 1682 nfs4_ga_res_t *garp;
1679 1683 char fn[MAXNAMELEN];
1680 1684 nfs4_recov_state_t recov = {NULL, 0};
1681 1685 nfs4_lost_rqst_t lost_rqst;
1682 1686 mntinfo4_t *mi = VTOMI4(vp);
1683 1687 bool_t abort;
1684 1688 char *failed_msg = "";
1685 1689 int fh_different;
1686 1690 hrtime_t t;
1687 1691 nfs4_bseqid_entry_t *bsep = NULL;
1688 1692
1689 1693 ASSERT(nfs4_consistent_type(vp));
1690 1694 ASSERT(nfs_zone() == mi->mi_zone);
1691 1695
1692 1696 nfs4_error_zinit(ep);
1693 1697
1694 1698 /* this is the cred used to find the open owner */
1695 1699 cr = state_to_cred(osp);
1696 1700 if (cr == NULL) {
1697 1701 failed_msg = "Couldn't reopen: no cred";
1698 1702 goto kill_file;
1699 1703 }
1700 1704 /* use this cred for OTW operations */
1701 1705 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1702 1706
1703 1707 top:
1704 1708 nfs4_error_zinit(ep);
1705 1709
1706 1710 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1707 1711 /* File system has been unmounted, quit */
1708 1712 ep->error = EIO;
1709 1713 failed_msg = "Couldn't reopen: file system has been unmounted";
1710 1714 goto kill_file;
1711 1715 }
1712 1716
1713 1717 oop = osp->os_open_owner;
1714 1718
1715 1719 ASSERT(oop != NULL);
1716 1720 if (oop == NULL) { /* be defensive in non-DEBUG */
1717 1721 failed_msg = "can't reopen: no open owner";
1718 1722 goto kill_file;
1719 1723 }
1720 1724 open_owner_hold(oop);
1721 1725
1722 1726 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1723 1727 if (ep->error) {
1724 1728 open_owner_rele(oop);
1725 1729 oop = NULL;
1726 1730 goto bailout;
1727 1731 }
1728 1732
1729 1733 /*
1730 1734 * If the rnode has a delegation and the delegation has been
1731 1735 * recovered and the server didn't request a recall and the caller
1732 1736 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1733 1737 * recovery) and the rnode hasn't been marked dead, then install
1734 1738 * the delegation stateid in the open stream. Otherwise, proceed
1735 1739 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1736 1740 */
1737 1741 mutex_enter(&rp->r_statev4_lock);
1738 1742 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1739 1743 !rp->r_deleg_return_pending &&
1740 1744 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1741 1745 !rp->r_deleg_needs_recall &&
1742 1746 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1743 1747 !(rp->r_flags & R4RECOVERR)) {
1744 1748 mutex_enter(&osp->os_sync_lock);
1745 1749 osp->os_delegation = 1;
1746 1750 osp->open_stateid = rp->r_deleg_stateid;
1747 1751 mutex_exit(&osp->os_sync_lock);
1748 1752 mutex_exit(&rp->r_statev4_lock);
1749 1753 goto bailout;
1750 1754 }
1751 1755 mutex_exit(&rp->r_statev4_lock);
1752 1756
1753 1757 /*
1754 1758 * If the file failed recovery, just quit. This failure need not
1755 1759 * affect other reopens, so don't return an error.
1756 1760 */
1757 1761 mutex_enter(&rp->r_statelock);
1758 1762 if (rp->r_flags & R4RECOVERR) {
1759 1763 mutex_exit(&rp->r_statelock);
1760 1764 ep->error = 0;
1761 1765 goto failed_reopen;
1762 1766 }
1763 1767 mutex_exit(&rp->r_statelock);
1764 1768
1765 1769 /*
1766 1770 * argop is empty here
1767 1771 *
1768 1772 * PUTFH, OPEN, GETATTR
1769 1773 */
1770 1774 args.ctag = TAG_REOPEN;
1771 1775 args.array_len = 4;
1772 1776 args.array = argop;
1773 1777
1774 1778 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1775 1779 "nfs4_reopen: file is type %d, id %s",
1776 1780 vp->v_type, rnode4info(VTOR4(vp))));
1777 1781
1778 1782 argop[0].argop = OP_CPUTFH;
1779 1783
1780 1784 if (claim != CLAIM_PREVIOUS) {
1781 1785 /*
1782 1786 * if this is a file mount then
1783 1787 * use the mntinfo parentfh
1784 1788 */
1785 1789 argop[0].nfs_argop4_u.opcputfh.sfh =
1786 1790 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1787 1791 VTOSV(vp)->sv_dfh;
1788 1792 } else {
1789 1793 /* putfh fh to reopen */
1790 1794 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1791 1795 }
1792 1796
1793 1797 argop[1].argop = OP_COPEN;
1794 1798 open_args = &argop[1].nfs_argop4_u.opcopen;
1795 1799 open_args->claim = claim;
1796 1800
1797 1801 if (claim == CLAIM_NULL) {
1798 1802
1799 1803 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1800 1804 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1801 1805 "failed for vp 0x%p for CLAIM_NULL with %m",
1802 1806 (void *)vp);
1803 1807 failed_msg = "Couldn't reopen: vtoname failed for "
1804 1808 "CLAIM_NULL";
1805 1809 /* nothing allocated yet */
1806 1810 goto kill_file;
1807 1811 }
1808 1812
1809 1813 open_args->open_claim4_u.cfile = fn;
1810 1814 } else if (claim == CLAIM_PREVIOUS) {
1811 1815
1812 1816 /*
1813 1817 * We have two cases to deal with here:
1814 1818 * 1) We're being called to reopen files in order to satisfy
1815 1819 * a lock operation request which requires us to explicitly
1816 1820 * reopen files which were opened under a delegation. If
1817 1821 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1818 1822 * that case, frc_use_claim_previous is TRUE and we must
1819 1823 * use the rnode's current delegation type (r_deleg_type).
1820 1824 * 2) We're reopening files during some form of recovery.
1821 1825 * In this case, frc_use_claim_previous is FALSE and we
1822 1826 * use the delegation type appropriate for recovery
1823 1827 * (r_deleg_needs_recovery).
1824 1828 */
1825 1829 mutex_enter(&rp->r_statev4_lock);
1826 1830 open_args->open_claim4_u.delegate_type =
1827 1831 frc_use_claim_previous ?
1828 1832 rp->r_deleg_type :
1829 1833 rp->r_deleg_needs_recovery;
1830 1834 mutex_exit(&rp->r_statev4_lock);
1831 1835
1832 1836 } else if (claim == CLAIM_DELEGATE_CUR) {
1833 1837
1834 1838 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1835 1839 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1836 1840 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1837 1841 "with %m", (void *)vp);
1838 1842 failed_msg = "Couldn't reopen: vtoname failed for "
1839 1843 "CLAIM_DELEGATE_CUR";
1840 1844 /* nothing allocated yet */
1841 1845 goto kill_file;
1842 1846 }
1843 1847
1844 1848 mutex_enter(&rp->r_statev4_lock);
1845 1849 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1846 1850 rp->r_deleg_stateid;
1847 1851 mutex_exit(&rp->r_statev4_lock);
1848 1852
1849 1853 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1850 1854 }
1851 1855 open_args->opentype = OPEN4_NOCREATE;
1852 1856 open_args->owner.clientid = mi2clientid(mi);
1853 1857 open_args->owner.owner_len = sizeof (oop->oo_name);
1854 1858 open_args->owner.owner_val =
1855 1859 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1856 1860 bcopy(&oop->oo_name, open_args->owner.owner_val,
1857 1861 open_args->owner.owner_len);
1858 1862 open_args->share_access = 0;
1859 1863 open_args->share_deny = 0;
1860 1864
1861 1865 mutex_enter(&osp->os_sync_lock);
1862 1866 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1863 1867 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1864 1868 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1865 1869 (void *)osp, (void *)rp, osp->os_share_acc_read,
1866 1870 osp->os_share_acc_write, osp->os_open_ref_count,
1867 1871 osp->os_mmap_read, osp->os_mmap_write, claim));
1868 1872
1869 1873 if (osp->os_share_acc_read || osp->os_mmap_read)
1870 1874 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1871 1875 if (osp->os_share_acc_write || osp->os_mmap_write)
1872 1876 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1873 1877 if (osp->os_share_deny_read)
1874 1878 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1875 1879 if (osp->os_share_deny_write)
1876 1880 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1877 1881 mutex_exit(&osp->os_sync_lock);
1878 1882
1879 1883 seqid = nfs4_get_open_seqid(oop) + 1;
1880 1884 open_args->seqid = seqid;
1881 1885
1882 1886 /* Construct the getfh part of the compound */
1883 1887 argop[2].argop = OP_GETFH;
1884 1888
1885 1889 /* Construct the getattr part of the compound */
1886 1890 argop[3].argop = OP_GETATTR;
1887 1891 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1888 1892 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1889 1893
1890 1894 t = gethrtime();
1891 1895
1892 1896 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1893 1897
1894 1898 if (ep->error) {
1895 1899 if (!is_recov && !frc_use_claim_previous &&
1896 1900 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1897 1901 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1898 1902 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1899 1903 cred_otw, vp, NULL, open_args);
1900 1904 abort = nfs4_start_recovery(ep,
1901 1905 VTOMI4(vp), vp, NULL, NULL,
1902 1906 lost_rqst.lr_op == OP_OPEN ?
1903 1907 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1904 1908 nfs4args_copen_free(open_args);
1905 1909 goto bailout;
1906 1910 }
1907 1911
1908 1912 nfs4args_copen_free(open_args);
1909 1913
1910 1914 if (ep->error == EACCES && cred_otw != cr) {
1911 1915 crfree(cred_otw);
1912 1916 cred_otw = cr;
1913 1917 crhold(cred_otw);
1914 1918 nfs4_end_open_seqid_sync(oop);
1915 1919 open_owner_rele(oop);
1916 1920 oop = NULL;
1917 1921 goto top;
1918 1922 }
1919 1923 if (ep->error == ETIMEDOUT)
1920 1924 goto bailout;
1921 1925 failed_msg = "Couldn't reopen: rpc error";
1922 1926 goto kill_file;
1923 1927 }
1924 1928
1925 1929 if (nfs4_need_to_bump_seqid(&res))
1926 1930 nfs4_set_open_seqid(seqid, oop, args.ctag);
1927 1931
1928 1932 switch (res.status) {
1929 1933 case NFS4_OK:
1930 1934 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1931 1935 mutex_enter(&rp->r_statelock);
1932 1936 rp->r_delay_interval = 0;
1933 1937 mutex_exit(&rp->r_statelock);
1934 1938 }
1935 1939 break;
1936 1940 case NFS4ERR_BAD_SEQID:
1937 1941 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1938 1942 args.ctag, open_args->seqid);
1939 1943
1940 1944 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1941 1945 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1942 1946 NULL, OP_OPEN, bsep, NULL, NULL);
1943 1947
1944 1948 nfs4args_copen_free(open_args);
1945 1949 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1946 1950 nfs4_end_open_seqid_sync(oop);
1947 1951 open_owner_rele(oop);
1948 1952 oop = NULL;
1949 1953 kmem_free(bsep, sizeof (*bsep));
1950 1954
1951 1955 goto kill_file;
1952 1956 case NFS4ERR_NO_GRACE:
1953 1957 nfs4args_copen_free(open_args);
1954 1958 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1955 1959 nfs4_end_open_seqid_sync(oop);
1956 1960 open_owner_rele(oop);
1957 1961 oop = NULL;
1958 1962 if (claim == CLAIM_PREVIOUS) {
1959 1963 /*
1960 1964 * Retry as a plain open. We don't need to worry about
1961 1965 * checking the changeinfo: it is acceptable for a
1962 1966 * client to re-open a file and continue processing
1963 1967 * (in the absence of locks).
1964 1968 */
1965 1969 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1966 1970 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1967 1971 "will retry as CLAIM_NULL"));
1968 1972 claim = CLAIM_NULL;
1969 1973 nfs4_mi_kstat_inc_no_grace(mi);
1970 1974 goto top;
1971 1975 }
1972 1976 failed_msg =
1973 1977 "Couldn't reopen: tried reclaim outside grace period. ";
1974 1978 goto kill_file;
1975 1979 case NFS4ERR_GRACE:
1976 1980 nfs4_set_grace_wait(mi);
1977 1981 nfs4args_copen_free(open_args);
1978 1982 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1979 1983 nfs4_end_open_seqid_sync(oop);
1980 1984 open_owner_rele(oop);
1981 1985 oop = NULL;
1982 1986 ep->error = nfs4_wait_for_grace(mi, &recov);
1983 1987 if (ep->error != 0)
1984 1988 goto bailout;
1985 1989 goto top;
1986 1990 case NFS4ERR_DELAY:
1987 1991 nfs4_set_delay_wait(vp);
1988 1992 nfs4args_copen_free(open_args);
1989 1993 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1990 1994 nfs4_end_open_seqid_sync(oop);
1991 1995 open_owner_rele(oop);
1992 1996 oop = NULL;
1993 1997 ep->error = nfs4_wait_for_delay(vp, &recov);
1994 1998 nfs4_mi_kstat_inc_delay(mi);
1995 1999 if (ep->error != 0)
1996 2000 goto bailout;
1997 2001 goto top;
1998 2002 case NFS4ERR_FHEXPIRED:
1999 2003 /* recover filehandle and retry */
2000 2004 abort = nfs4_start_recovery(ep,
2001 2005 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2002 2006 nfs4args_copen_free(open_args);
2003 2007 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2004 2008 nfs4_end_open_seqid_sync(oop);
2005 2009 open_owner_rele(oop);
2006 2010 oop = NULL;
2007 2011 if (abort == FALSE)
2008 2012 goto top;
2009 2013 failed_msg = "Couldn't reopen: recovery aborted";
2010 2014 goto kill_file;
2011 2015 case NFS4ERR_RESOURCE:
2012 2016 case NFS4ERR_STALE_CLIENTID:
2013 2017 case NFS4ERR_WRONGSEC:
2014 2018 case NFS4ERR_EXPIRED:
2015 2019 /*
2016 2020 * Do not mark the file dead and let the calling
2017 2021 * function initiate recovery.
2018 2022 */
2019 2023 nfs4args_copen_free(open_args);
2020 2024 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2021 2025 nfs4_end_open_seqid_sync(oop);
2022 2026 open_owner_rele(oop);
2023 2027 oop = NULL;
2024 2028 goto bailout;
2025 2029 case NFS4ERR_ACCESS:
2026 2030 if (cred_otw != cr) {
2027 2031 crfree(cred_otw);
2028 2032 cred_otw = cr;
2029 2033 crhold(cred_otw);
2030 2034 nfs4args_copen_free(open_args);
2031 2035 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2032 2036 nfs4_end_open_seqid_sync(oop);
2033 2037 open_owner_rele(oop);
2034 2038 oop = NULL;
2035 2039 goto top;
2036 2040 }
2037 2041 /* fall through */
2038 2042 default:
2039 2043 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2040 2044 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2041 2045 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2042 2046 rnode4info(VTOR4(vp))));
2043 2047 failed_msg = "Couldn't reopen: NFSv4 error";
2044 2048 nfs4args_copen_free(open_args);
2045 2049 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2046 2050 goto kill_file;
2047 2051 }
2048 2052
2049 2053 resop = &res.array[1]; /* open res */
2050 2054 op_res = &resop->nfs_resop4_u.opopen;
2051 2055
2052 2056 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2053 2057
2054 2058 /*
2055 2059 * Check if the path we reopened really is the same
2056 2060 * file. We could end up in a situation where the file
2057 2061 * was removed and a new file created with the same name.
2058 2062 */
2059 2063 resop = &res.array[2];
2060 2064 gf_res = &resop->nfs_resop4_u.opgetfh;
2061 2065 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2062 2066 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2063 2067 if (fh_different) {
2064 2068 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2065 2069 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2066 2070 /* Oops, we don't have the same file */
2067 2071 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2068 2072 failed_msg = "Couldn't reopen: Persistent "
2069 2073 "file handle changed";
2070 2074 else
2071 2075 failed_msg = "Couldn't reopen: Volatile "
2072 2076 "(no expire on open) file handle changed";
2073 2077
2074 2078 nfs4args_copen_free(open_args);
2075 2079 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2076 2080 nfs_rw_exit(&mi->mi_fh_lock);
2077 2081 goto kill_file;
2078 2082
2079 2083 } else {
2080 2084 /*
2081 2085 * We have volatile file handles that don't compare.
2082 2086 * If the fids are the same then we assume that the
2083 2087 * file handle expired but the rnode still refers to
2084 2088 * the same file object.
2085 2089 *
2086 2090 * First check that we have fids or not.
2087 2091 * If we don't we have a dumb server so we will
2088 2092 * just assume every thing is ok for now.
2089 2093 */
2090 2094 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2091 2095 rp->r_attr.va_mask & AT_NODEID &&
2092 2096 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2093 2097 /*
2094 2098 * We have fids, but they don't
2095 2099 * compare. So kill the file.
2096 2100 */
2097 2101 failed_msg =
2098 2102 "Couldn't reopen: file handle changed"
2099 2103 " due to mismatched fids";
2100 2104 nfs4args_copen_free(open_args);
2101 2105 (void) xdr_free(xdr_COMPOUND4res_clnt,
2102 2106 (caddr_t)&res);
2103 2107 nfs_rw_exit(&mi->mi_fh_lock);
2104 2108 goto kill_file;
2105 2109 } else {
2106 2110 /*
2107 2111 * We have volatile file handles that refers
2108 2112 * to the same file (at least they have the
2109 2113 * same fid) or we don't have fids so we
2110 2114 * can't tell. :(. We'll be a kind and accepting
2111 2115 * client so we'll update the rnode's file
2112 2116 * handle with the otw handle.
2113 2117 *
2114 2118 * We need to drop mi->mi_fh_lock since
2115 2119 * sh4_update acquires it. Since there is
2116 2120 * only one recovery thread there is no
2117 2121 * race.
2118 2122 */
2119 2123 nfs_rw_exit(&mi->mi_fh_lock);
2120 2124 sfh4_update(rp->r_fh, &gf_res->object);
2121 2125 }
2122 2126 }
2123 2127 } else {
2124 2128 nfs_rw_exit(&mi->mi_fh_lock);
2125 2129 }
2126 2130
2127 2131 ASSERT(nfs4_consistent_type(vp));
2128 2132
2129 2133 /*
2130 2134 * If the server wanted an OPEN_CONFIRM but that fails, just start
2131 2135 * over. Presumably if there is a persistent error it will show up
2132 2136 * when we resend the OPEN.
2133 2137 */
2134 2138 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2135 2139 bool_t retry_open = FALSE;
2136 2140
2137 2141 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2138 2142 cred_otw, is_recov, &retry_open,
2139 2143 oop, FALSE, ep, NULL);
2140 2144 if (ep->error || ep->stat) {
2141 2145 nfs4args_copen_free(open_args);
2142 2146 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2143 2147 nfs4_end_open_seqid_sync(oop);
2144 2148 open_owner_rele(oop);
2145 2149 oop = NULL;
2146 2150 goto top;
2147 2151 }
2148 2152 }
2149 2153
2150 2154 mutex_enter(&osp->os_sync_lock);
2151 2155 osp->open_stateid = op_res->stateid;
2152 2156 osp->os_delegation = 0;
2153 2157 /*
2154 2158 * Need to reset this bitfield for the possible case where we were
2155 2159 * going to OTW CLOSE the file, got a non-recoverable error, and before
2156 2160 * we could retry the CLOSE, OPENed the file again.
2157 2161 */
2158 2162 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2159 2163 osp->os_final_close = 0;
2160 2164 osp->os_force_close = 0;
2161 2165 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2162 2166 osp->os_dc_openacc = open_args->share_access;
2163 2167 mutex_exit(&osp->os_sync_lock);
2164 2168
2165 2169 nfs4_end_open_seqid_sync(oop);
2166 2170
2167 2171 /* accept delegation, if any */
2168 2172 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2169 2173
2170 2174 nfs4args_copen_free(open_args);
2171 2175
2172 2176 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2173 2177
2174 2178 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2175 2179
2176 2180 ASSERT(nfs4_consistent_type(vp));
2177 2181
2178 2182 open_owner_rele(oop);
2179 2183 crfree(cr);
2180 2184 crfree(cred_otw);
2181 2185 return;
2182 2186
2183 2187 kill_file:
2184 2188 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2185 2189 failed_reopen:
2186 2190 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2187 2191 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2188 2192 (void *)osp, (void *)cr, rnode4info(rp)));
2189 2193 mutex_enter(&osp->os_sync_lock);
2190 2194 osp->os_failed_reopen = 1;
2191 2195 mutex_exit(&osp->os_sync_lock);
2192 2196 bailout:
2193 2197 if (oop != NULL) {
2194 2198 nfs4_end_open_seqid_sync(oop);
2195 2199 open_owner_rele(oop);
2196 2200 }
2197 2201 if (cr != NULL)
2198 2202 crfree(cr);
2199 2203 if (cred_otw != NULL)
2200 2204 crfree(cred_otw);
2201 2205 }
2202 2206
2203 2207 /* for . and .. OPENs */
2204 2208 /* ARGSUSED */
2205 2209 static int
2206 2210 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2207 2211 {
2208 2212 rnode4_t *rp;
2209 2213 nfs4_ga_res_t gar;
2210 2214
2211 2215 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2212 2216
2213 2217 /*
2214 2218 * If close-to-open consistency checking is turned off or
2215 2219 * if there is no cached data, we can avoid
2216 2220 * the over the wire getattr. Otherwise, force a
2217 2221 * call to the server to get fresh attributes and to
2218 2222 * check caches. This is required for close-to-open
2219 2223 * consistency.
2220 2224 */
2221 2225 rp = VTOR4(*vpp);
2222 2226 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2223 2227 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2224 2228 return (0);
2225 2229
2226 2230 gar.n4g_va.va_mask = AT_ALL;
2227 2231 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2228 2232 }
2229 2233
2230 2234 /*
2231 2235 * CLOSE a file
2232 2236 */
2233 2237 /* ARGSUSED */
2234 2238 static int
2235 2239 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2236 2240 caller_context_t *ct)
2237 2241 {
2238 2242 rnode4_t *rp;
2239 2243 int error = 0;
2240 2244 int r_error = 0;
2241 2245 int n4error = 0;
2242 2246 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2243 2247
2244 2248 /*
2245 2249 * Remove client state for this (lockowner, file) pair.
2246 2250 * Issue otw v4 call to have the server do the same.
2247 2251 */
2248 2252
2249 2253 rp = VTOR4(vp);
2250 2254
2251 2255 /*
2252 2256 * zone_enter(2) prevents processes from changing zones with NFS files
2253 2257 * open; if we happen to get here from the wrong zone we can't do
2254 2258 * anything over the wire.
2255 2259 */
2256 2260 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2257 2261 /*
2258 2262 * We could attempt to clean up locks, except we're sure
2259 2263 * that the current process didn't acquire any locks on
2260 2264 * the file: any attempt to lock a file belong to another zone
2261 2265 * will fail, and one can't lock an NFS file and then change
2262 2266 * zones, as that fails too.
2263 2267 *
2264 2268 * Returning an error here is the sane thing to do. A
2265 2269 * subsequent call to VN_RELE() which translates to a
2266 2270 * nfs4_inactive() will clean up state: if the zone of the
2267 2271 * vnode's origin is still alive and kicking, the inactive
2268 2272 * thread will handle the request (from the correct zone), and
2269 2273 * everything (minus the OTW close call) should be OK. If the
2270 2274 * zone is going away nfs4_async_inactive() will throw away
2271 2275 * delegations, open streams and cached pages inline.
2272 2276 */
2273 2277 return (EIO);
2274 2278 }
2275 2279
2276 2280 /*
2277 2281 * If we are using local locking for this filesystem, then
2278 2282 * release all of the SYSV style record locks. Otherwise,
2279 2283 * we are doing network locking and we need to release all
2280 2284 * of the network locks. All of the locks held by this
2281 2285 * process on this file are released no matter what the
2282 2286 * incoming reference count is.
2283 2287 */
2284 2288 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2285 2289 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2286 2290 cleanshares(vp, ttoproc(curthread)->p_pid);
2287 2291 } else
2288 2292 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2289 2293
2290 2294 if (e.error) {
2291 2295 struct lm_sysid *lmsid;
2292 2296 lmsid = nfs4_find_sysid(VTOMI4(vp));
2293 2297 if (lmsid == NULL) {
2294 2298 DTRACE_PROBE2(unknown__sysid, int, e.error,
2295 2299 vnode_t *, vp);
2296 2300 } else {
2297 2301 cleanlocks(vp, ttoproc(curthread)->p_pid,
2298 2302 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2299 2303 }
2300 2304 return (e.error);
2301 2305 }
2302 2306
2303 2307 if (count > 1)
2304 2308 return (0);
2305 2309
2306 2310 /*
2307 2311 * If the file has been `unlinked', then purge the
2308 2312 * DNLC so that this vnode will get reycled quicker
2309 2313 * and the .nfs* file on the server will get removed.
2310 2314 */
2311 2315 if (rp->r_unldvp != NULL)
2312 2316 dnlc_purge_vp(vp);
2313 2317
2314 2318 /*
2315 2319 * If the file was open for write and there are pages,
2316 2320 * do a synchronous flush and commit of all of the
2317 2321 * dirty and uncommitted pages.
2318 2322 */
2319 2323 ASSERT(!e.error);
2320 2324 if ((flag & FWRITE) && nfs4_has_pages(vp))
2321 2325 error = nfs4_putpage_commit(vp, 0, 0, cr);
2322 2326
2323 2327 mutex_enter(&rp->r_statelock);
2324 2328 r_error = rp->r_error;
2325 2329 rp->r_error = 0;
2326 2330 mutex_exit(&rp->r_statelock);
2327 2331
2328 2332 /*
2329 2333 * If this file type is one for which no explicit 'open' was
2330 2334 * done, then bail now (ie. no need for protocol 'close'). If
2331 2335 * there was an error w/the vm subsystem, return _that_ error,
2332 2336 * otherwise, return any errors that may've been reported via
2333 2337 * the rnode.
2334 2338 */
2335 2339 if (vp->v_type != VREG)
2336 2340 return (error ? error : r_error);
2337 2341
2338 2342 /*
2339 2343 * The sync putpage commit may have failed above, but since
2340 2344 * we're working w/a regular file, we need to do the protocol
2341 2345 * 'close' (nfs4close_one will figure out if an otw close is
2342 2346 * needed or not). Report any errors _after_ doing the protocol
2343 2347 * 'close'.
2344 2348 */
2345 2349 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2346 2350 n4error = e.error ? e.error : geterrno4(e.stat);
2347 2351
2348 2352 /*
2349 2353 * Error reporting prio (Hi -> Lo)
2350 2354 *
2351 2355 * i) nfs4_putpage_commit (error)
2352 2356 * ii) rnode's (r_error)
2353 2357 * iii) nfs4close_one (n4error)
2354 2358 */
2355 2359 return (error ? error : (r_error ? r_error : n4error));
2356 2360 }
2357 2361
2358 2362 /*
2359 2363 * Initialize *lost_rqstp.
2360 2364 */
2361 2365
2362 2366 static void
2363 2367 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2364 2368 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2365 2369 vnode_t *vp)
2366 2370 {
2367 2371 if (error != ETIMEDOUT && error != EINTR &&
2368 2372 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2369 2373 lost_rqstp->lr_op = 0;
2370 2374 return;
2371 2375 }
2372 2376
2373 2377 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2374 2378 "nfs4close_save_lost_rqst: error %d", error));
2375 2379
2376 2380 lost_rqstp->lr_op = OP_CLOSE;
2377 2381 /*
2378 2382 * The vp is held and rele'd via the recovery code.
2379 2383 * See nfs4_save_lost_rqst.
2380 2384 */
2381 2385 lost_rqstp->lr_vp = vp;
2382 2386 lost_rqstp->lr_dvp = NULL;
2383 2387 lost_rqstp->lr_oop = oop;
2384 2388 lost_rqstp->lr_osp = osp;
2385 2389 ASSERT(osp != NULL);
2386 2390 ASSERT(mutex_owned(&osp->os_sync_lock));
2387 2391 osp->os_pending_close = 1;
2388 2392 lost_rqstp->lr_lop = NULL;
2389 2393 lost_rqstp->lr_cr = cr;
2390 2394 lost_rqstp->lr_flk = NULL;
2391 2395 lost_rqstp->lr_putfirst = FALSE;
2392 2396 }
2393 2397
2394 2398 /*
2395 2399 * Assumes you already have the open seqid sync grabbed as well as the
2396 2400 * 'os_sync_lock'. Note: this will release the open seqid sync and
2397 2401 * 'os_sync_lock' if client recovery starts. Calling functions have to
2398 2402 * be prepared to handle this.
2399 2403 *
2400 2404 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2401 2405 * was needed and was started, and that the calling function should retry
2402 2406 * this function; otherwise it is returned as 0.
2403 2407 *
2404 2408 * Errors are returned via the nfs4_error_t parameter.
2405 2409 */
2406 2410 static void
2407 2411 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2408 2412 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2409 2413 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2410 2414 {
2411 2415 COMPOUND4args_clnt args;
2412 2416 COMPOUND4res_clnt res;
2413 2417 CLOSE4args *close_args;
2414 2418 nfs_resop4 *resop;
2415 2419 nfs_argop4 argop[3];
2416 2420 int doqueue = 1;
2417 2421 mntinfo4_t *mi;
2418 2422 seqid4 seqid;
2419 2423 vnode_t *vp;
2420 2424 bool_t needrecov = FALSE;
2421 2425 nfs4_lost_rqst_t lost_rqst;
2422 2426 hrtime_t t;
2423 2427
2424 2428 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2425 2429
2426 2430 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2427 2431
2428 2432 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2429 2433
2430 2434 /* Only set this to 1 if recovery is started */
2431 2435 *recov = 0;
2432 2436
2433 2437 /* do the OTW call to close the file */
2434 2438
2435 2439 if (close_type == CLOSE_RESEND)
2436 2440 args.ctag = TAG_CLOSE_LOST;
2437 2441 else if (close_type == CLOSE_AFTER_RESEND)
2438 2442 args.ctag = TAG_CLOSE_UNDO;
2439 2443 else
2440 2444 args.ctag = TAG_CLOSE;
2441 2445
2442 2446 args.array_len = 3;
2443 2447 args.array = argop;
2444 2448
2445 2449 vp = RTOV4(rp);
2446 2450
2447 2451 mi = VTOMI4(vp);
2448 2452
2449 2453 /* putfh target fh */
2450 2454 argop[0].argop = OP_CPUTFH;
2451 2455 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2452 2456
2453 2457 argop[1].argop = OP_GETATTR;
2454 2458 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2455 2459 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2456 2460
2457 2461 argop[2].argop = OP_CLOSE;
2458 2462 close_args = &argop[2].nfs_argop4_u.opclose;
2459 2463
2460 2464 seqid = nfs4_get_open_seqid(oop) + 1;
2461 2465
2462 2466 close_args->seqid = seqid;
2463 2467 close_args->open_stateid = osp->open_stateid;
2464 2468
2465 2469 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2466 2470 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2467 2471 rnode4info(rp)));
2468 2472
2469 2473 t = gethrtime();
2470 2474
2471 2475 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2472 2476
2473 2477 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2474 2478 nfs4_set_open_seqid(seqid, oop, args.ctag);
2475 2479 }
2476 2480
2477 2481 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2478 2482 if (ep->error && !needrecov) {
2479 2483 /*
2480 2484 * if there was an error and no recovery is to be done
2481 2485 * then then set up the file to flush its cache if
2482 2486 * needed for the next caller.
2483 2487 */
2484 2488 mutex_enter(&rp->r_statelock);
2485 2489 PURGE_ATTRCACHE4_LOCKED(rp);
2486 2490 rp->r_flags &= ~R4WRITEMODIFIED;
2487 2491 mutex_exit(&rp->r_statelock);
2488 2492 return;
2489 2493 }
2490 2494
2491 2495 if (needrecov) {
2492 2496 bool_t abort;
2493 2497 nfs4_bseqid_entry_t *bsep = NULL;
2494 2498
2495 2499 if (close_type != CLOSE_RESEND)
2496 2500 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2497 2501 osp, cred_otw, vp);
2498 2502
2499 2503 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2500 2504 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2501 2505 0, args.ctag, close_args->seqid);
2502 2506
2503 2507 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2504 2508 "nfs4close_otw: initiating recovery. error %d "
2505 2509 "res.status %d", ep->error, res.status));
2506 2510
2507 2511 /*
2508 2512 * Drop the 'os_sync_lock' here so we don't hit
2509 2513 * a potential recursive mutex_enter via an
2510 2514 * 'open_stream_hold()'.
2511 2515 */
2512 2516 mutex_exit(&osp->os_sync_lock);
2513 2517 *have_sync_lockp = 0;
2514 2518 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2515 2519 (close_type != CLOSE_RESEND &&
2516 2520 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2517 2521 OP_CLOSE, bsep, NULL, NULL);
2518 2522
2519 2523 /* drop open seq sync, and let the calling function regrab it */
2520 2524 nfs4_end_open_seqid_sync(oop);
2521 2525 *did_start_seqid_syncp = 0;
2522 2526
2523 2527 if (bsep)
2524 2528 kmem_free(bsep, sizeof (*bsep));
2525 2529 /*
2526 2530 * For signals, the caller wants to quit, so don't say to
2527 2531 * retry. For forced unmount, if it's a user thread, it
2528 2532 * wants to quit. If it's a recovery thread, the retry
2529 2533 * will happen higher-up on the call stack. Either way,
2530 2534 * don't say to retry.
2531 2535 */
2532 2536 if (abort == FALSE && ep->error != EINTR &&
2533 2537 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2534 2538 close_type != CLOSE_RESEND &&
2535 2539 close_type != CLOSE_AFTER_RESEND)
2536 2540 *recov = 1;
2537 2541 else
2538 2542 *recov = 0;
2539 2543
2540 2544 if (!ep->error)
2541 2545 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2542 2546 return;
2543 2547 }
2544 2548
2545 2549 if (res.status) {
2546 2550 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2547 2551 return;
2548 2552 }
2549 2553
2550 2554 mutex_enter(&rp->r_statev4_lock);
2551 2555 rp->created_v4 = 0;
2552 2556 mutex_exit(&rp->r_statev4_lock);
2553 2557
2554 2558 resop = &res.array[2];
2555 2559 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2556 2560 osp->os_valid = 0;
2557 2561
2558 2562 /*
2559 2563 * This removes the reference obtained at OPEN; ie, when the
2560 2564 * open stream structure was created.
2561 2565 *
2562 2566 * We don't have to worry about calling 'open_stream_rele'
2563 2567 * since we our currently holding a reference to the open
2564 2568 * stream which means the count cannot go to 0 with this
2565 2569 * decrement.
2566 2570 */
2567 2571 ASSERT(osp->os_ref_count >= 2);
2568 2572 osp->os_ref_count--;
2569 2573
2570 2574 if (!ep->error)
2571 2575 nfs4_attr_cache(vp,
2572 2576 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2573 2577 t, cred_otw, TRUE, NULL);
2574 2578
2575 2579 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2576 2580 " returning %d", ep->error));
2577 2581
2578 2582 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2579 2583 }
2580 2584
/*
 * nfs4_read: VOP_READ entry point for NFSv4 regular files.
 *
 * Transfers data starting at uiop->uio_loffset into the caller's uio.
 * The read is served either directly over the wire (when caching is
 * disabled, or client-side direct I/O is in effect and the file has no
 * mappings and no cached pages) or through the VM page cache, one
 * MAXBSIZE-aligned window at a time, via vpm or segmap.
 *
 * Returns 0 on success, or an errno: EISDIR for non-regular files,
 * EIO for cross-zone access or a pending recovery error, EINVAL for a
 * negative or overflowing offset, EINTR if the wait for a cache purge
 * is interrupted by a signal.  The caller must already hold r_rwlock
 * as reader (asserted below).
 */
2581 2585 /* ARGSUSED */
2582 2586 static int
2583 2587 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2584 2588 caller_context_t *ct)
2585 2589 {
2586 2590 rnode4_t *rp;
2587 2591 u_offset_t off;
2588 2592 offset_t diff;
2589 2593 uint_t on;
2590 2594 uint_t n;
2591 2595 caddr_t base;
2592 2596 uint_t flags;
2593 2597 int error;
2594 2598 mntinfo4_t *mi;
2595 2599
2596 2600 rp = VTOR4(vp);
2597 2601
2598 2602 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2599 2603
		/* Operate on the real vnode if this is a shadow vnode. */
2600 2604 if (IS_SHADOW(vp, rp))
2601 2605 vp = RTOV4(rp);
2602 2606
2603 2607 if (vp->v_type != VREG)
2604 2608 return (EISDIR);
2605 2609
2606 2610 mi = VTOMI4(vp);
2607 2611
2608 2612 if (nfs_zone() != mi->mi_zone)
2609 2613 return (EIO);
2610 2614
2611 2615 if (uiop->uio_resid == 0)
2612 2616 return (0);
2613 2617
2614 2618 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2615 2619 return (EINVAL);
2616 2620
		/*
		 * If recovery marked this rnode, fail early with the saved
		 * error (or EIO if none was recorded).
		 */
2617 2621 mutex_enter(&rp->r_statelock);
2618 2622 if (rp->r_flags & R4RECOVERRP)
2619 2623 error = (rp->r_error ? rp->r_error : EIO);
2620 2624 else
2621 2625 error = 0;
2622 2626 mutex_exit(&rp->r_statelock);
2623 2627 if (error)
2624 2628 return (error);
2625 2629
2626 2630 /*
2627 2631 * Bypass VM if caching has been disabled (e.g., locking) or if
2628 2632 * using client-side direct I/O and the file is not mmap'd and
2629 2633 * there are no cached pages.
2630 2634 */
2631 2635 if ((vp->v_flag & VNOCACHE) ||
2632 2636 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2633 2637 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2634 2638 size_t resid = 0;
2635 2639
2636 2640 return (nfs4read(vp, NULL, uiop->uio_loffset,
2637 2641 uiop->uio_resid, &resid, cr, FALSE, uiop));
2638 2642 }
2639 2643
2640 2644 error = 0;
2641 2645
	/*
	 * Cached path: copy through the page cache one MAXBSIZE-aligned
	 * window per iteration until the uio is satisfied, an error
	 * occurs, or we reach (cached) EOF.
	 */
2642 2646 do {
2643 2647 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2644 2648 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2645 2649 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2646 2650
2647 2651 if (error = nfs4_validate_caches(vp, cr))
2648 2652 break;
2649 2653
		/*
		 * Wait out any in-progress cache purge before trusting
		 * r_size; bail with EINTR if a signal arrives.
		 */
2650 2654 mutex_enter(&rp->r_statelock);
2651 2655 while (rp->r_flags & R4INCACHEPURGE) {
2652 2656 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2653 2657 mutex_exit(&rp->r_statelock);
2654 2658 return (EINTR);
2655 2659 }
2656 2660 }
2657 2661 diff = rp->r_size - uiop->uio_loffset;
2658 2662 mutex_exit(&rp->r_statelock);
		/* diff <= 0: at or past EOF per our cached size; done. */
2659 2663 if (diff <= 0)
2660 2664 break;
2661 2665 if (diff < n)
2662 2666 n = (uint_t)diff;
2663 2667
2664 2668 if (vpm_enable) {
2665 2669 /*
2666 2670 * Copy data.
2667 2671 */
2668 2672 error = vpm_data_copy(vp, off + on, n, uiop,
2669 2673 1, NULL, 0, S_READ);
2670 2674 } else {
2671 2675 base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2672 2676 S_READ);
2673 2677
2674 2678 error = uiomove(base + on, n, UIO_READ, uiop);
2675 2679 }
2676 2680
2677 2681 if (!error) {
2678 2682 /*
2679 2683 * If read a whole block or read to eof,
2680 2684 * won't need this buffer again soon.
2681 2685 */
2682 2686 mutex_enter(&rp->r_statelock);
2683 2687 if (n + on == MAXBSIZE ||
2684 2688 uiop->uio_loffset == rp->r_size)
2685 2689 flags = SM_DONTNEED;
2686 2690 else
2687 2691 flags = 0;
2688 2692 mutex_exit(&rp->r_statelock);
2689 2693 if (vpm_enable) {
2690 2694 error = vpm_sync_pages(vp, off, n, flags);
2691 2695 } else {
2692 2696 error = segmap_release(segkmap, base, flags);
2693 2697 }
2694 2698 } else {
			/* Error: release the mapping, ignore sync status. */
2695 2699 if (vpm_enable) {
2696 2700 (void) vpm_sync_pages(vp, off, n, 0);
2697 2701 } else {
2698 2702 (void) segmap_release(segkmap, base, 0);
2699 2703 }
2700 2704 }
2701 2705 } while (!error && uiop->uio_resid > 0);
2702 2706
2703 2707 return (error);
2704 2708 }
2705 2709
/*
 * nfs4_write: VOP_WRITE entry point for NFSv4 regular files.
 *
 * Handles O_APPEND serialization (upgrading r_rwlock to writer and
 * fetching the server's current size), enforces the process file-size
 * limit (posting the RLIMIT_FSIZE rctl action and returning EFBIG
 * when nothing at all can be written), and then performs the write
 * either directly over the wire (VNOCACHE / direct I/O with no
 * mappings or cached pages) or through the VM page cache via
 * writerp4().  On error, the uio is rewound to the start of the
 * failed chunk; on success, any resid trimmed by the limit check is
 * folded back in.  Returns 0 or an errno.
 */
2706 2710 /* ARGSUSED */
2707 2711 static int
2708 2712 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2709 2713 caller_context_t *ct)
2710 2714 {
2711 2715 rlim64_t limit = uiop->uio_llimit;
2712 2716 rnode4_t *rp;
2713 2717 u_offset_t off;
2714 2718 caddr_t base;
2715 2719 uint_t flags;
2716 2720 int remainder;
2717 2721 size_t n;
2718 2722 int on;
2719 2723 int error;
2720 2724 int resid;
2721 2725 u_offset_t offset;
2722 2726 mntinfo4_t *mi;
2723 2727 uint_t bsize;
2724 2728
2725 2729 rp = VTOR4(vp);
2726 2730
		/* Operate on the real vnode if this is a shadow vnode. */
2727 2731 if (IS_SHADOW(vp, rp))
2728 2732 vp = RTOV4(rp);
2729 2733
2730 2734 if (vp->v_type != VREG)
2731 2735 return (EISDIR);
2732 2736
2733 2737 mi = VTOMI4(vp);
2734 2738
2735 2739 if (nfs_zone() != mi->mi_zone)
2736 2740 return (EIO);
2737 2741
2738 2742 if (uiop->uio_resid == 0)
2739 2743 return (0);
2740 2744
		/* Fail early if recovery has marked this rnode bad. */
2741 2745 mutex_enter(&rp->r_statelock);
2742 2746 if (rp->r_flags & R4RECOVERRP)
2743 2747 error = (rp->r_error ? rp->r_error : EIO);
2744 2748 else
2745 2749 error = 0;
2746 2750 mutex_exit(&rp->r_statelock);
2747 2751 if (error)
2748 2752 return (error);
2749 2753
2750 2754 if (ioflag & FAPPEND) {
2751 2755 struct vattr va;
2752 2756
2753 2757 /*
2754 2758 * Must serialize if appending.
2755 2759 */
2756 2760 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2757 2761 nfs_rw_exit(&rp->r_rwlock);
2758 2762 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2759 2763 INTR4(vp)))
2760 2764 return (EINTR);
2761 2765 }
2762 2766
		/* Append starts at the server's idea of the file size. */
2763 2767 va.va_mask = AT_SIZE;
2764 2768 error = nfs4getattr(vp, &va, cr);
2765 2769 if (error)
2766 2770 return (error);
2767 2771 uiop->uio_loffset = va.va_size;
2768 2772 }
2769 2773
2770 2774 offset = uiop->uio_loffset + uiop->uio_resid;
2771 2775
2772 2776 if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2773 2777 return (EINVAL);
2774 2778
2775 2779 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2776 2780 limit = MAXOFFSET_T;
2777 2781
2778 2782 /*
2779 2783 * Check to make sure that the process will not exceed
2780 2784 * its limit on file size. It is okay to write up to
2781 2785 * the limit, but not beyond. Thus, the write which
2782 2786 * reaches the limit will be short and the next write
2783 2787 * will return an error.
2784 2788 */
2785 2789 remainder = 0;
2786 2790 if (offset > uiop->uio_llimit) {
2787 2791 remainder = offset - uiop->uio_llimit;
2788 2792 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2789 2793 if (uiop->uio_resid <= 0) {
2790 2794 proc_t *p = ttoproc(curthread);
2791 2795
			/* Entirely beyond the limit: deliver SIGXFSZ, EFBIG. */
2792 2796 uiop->uio_resid += remainder;
2793 2797 mutex_enter(&p->p_lock);
2794 2798 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2795 2799 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2796 2800 mutex_exit(&p->p_lock);
2797 2801 return (EFBIG);
2798 2802 }
2799 2803 }
2800 2804
2801 2805 /* update the change attribute, if we have a write delegation */
2802 2806
2803 2807 mutex_enter(&rp->r_statev4_lock);
2804 2808 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2805 2809 rp->r_deleg_change++;
2806 2810
2807 2811 mutex_exit(&rp->r_statev4_lock);
2808 2812
	/* Reader side of lkserlock: excludes lock-manager serialization. */
2809 2813 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2810 2814 return (EINTR);
2811 2815
2812 2816 /*
2813 2817 * Bypass VM if caching has been disabled (e.g., locking) or if
2814 2818 * using client-side direct I/O and the file is not mmap'd and
2815 2819 * there are no cached pages.
2816 2820 */
2817 2821 if ((vp->v_flag & VNOCACHE) ||
2818 2822 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2819 2823 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2820 2824 size_t bufsize;
2821 2825 int count;
2822 2826 u_offset_t org_offset;
2823 2827 stable_how4 stab_comm;
	/*
	 * Direct (uncached) write path.  Also reached via goto from the
	 * cached path below when faulting in a page for a write-only
	 * file fails with EACCES.
	 */
2824 2828 nfs4_fwrite:
2825 2829 if (rp->r_flags & R4STALE) {
2826 2830 resid = uiop->uio_resid;
2827 2831 offset = uiop->uio_loffset;
2828 2832 error = rp->r_error;
2829 2833 /*
2830 2834 * A close may have cleared r_error, if so,
2831 2835 * propagate ESTALE error return properly
2832 2836 */
2833 2837 if (error == 0)
2834 2838 error = ESTALE;
2835 2839 goto bottom;
2836 2840 }
2837 2841
		/* Bounce buffer: uiomove in, then write OTW chunkwise. */
2838 2842 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2839 2843 base = kmem_alloc(bufsize, KM_SLEEP);
2840 2844 do {
2841 2845 if (ioflag & FDSYNC)
2842 2846 stab_comm = DATA_SYNC4;
2843 2847 else
2844 2848 stab_comm = FILE_SYNC4;
2845 2849 resid = uiop->uio_resid;
2846 2850 offset = uiop->uio_loffset;
2847 2851 count = MIN(uiop->uio_resid, bufsize);
2848 2852 org_offset = uiop->uio_loffset;
2849 2853 error = uiomove(base, count, UIO_WRITE, uiop);
2850 2854 if (!error) {
2851 2855 error = nfs4write(vp, base, org_offset,
2852 2856 count, cr, &stab_comm);
2853 2857 if (!error) {
2854 2858 mutex_enter(&rp->r_statelock);
2855 2859 if (rp->r_size < uiop->uio_loffset)
2856 2860 rp->r_size = uiop->uio_loffset;
2857 2861 mutex_exit(&rp->r_statelock);
2858 2862 }
2859 2863 }
2860 2864 } while (!error && uiop->uio_resid > 0);
2861 2865 kmem_free(base, bufsize);
2862 2866 goto bottom;
2863 2867 }
2864 2868
2865 2869 bsize = vp->v_vfsp->vfs_bsize;
2866 2870
	/* Cached path: dirty pages one MAXBSIZE window per iteration. */
2867 2871 do {
2868 2872 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2869 2873 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2870 2874 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2871 2875
2872 2876 resid = uiop->uio_resid;
2873 2877 offset = uiop->uio_loffset;
2874 2878
2875 2879 if (rp->r_flags & R4STALE) {
2876 2880 error = rp->r_error;
2877 2881 /*
2878 2882 * A close may have cleared r_error, if so,
2879 2883 * propagate ESTALE error return properly
2880 2884 */
2881 2885 if (error == 0)
2882 2886 error = ESTALE;
2883 2887 break;
2884 2888 }
2885 2889
2886 2890 /*
2887 2891 * Don't create dirty pages faster than they
2888 2892 * can be cleaned so that the system doesn't
2889 2893 * get imbalanced. If the async queue is
2890 2894 * maxed out, then wait for it to drain before
2891 2895 * creating more dirty pages. Also, wait for
2892 2896 * any threads doing pagewalks in the vop_getattr
2893 2897 * entry points so that they don't block for
2894 2898 * long periods.
2895 2899 */
2896 2900 mutex_enter(&rp->r_statelock);
2897 2901 while ((mi->mi_max_threads != 0 &&
2898 2902 rp->r_awcount > 2 * mi->mi_max_threads) ||
2899 2903 rp->r_gcount > 0) {
2900 2904 if (INTR4(vp)) {
2901 2905 klwp_t *lwp = ttolwp(curthread);
2902 2906
				/* lwp_nostop guards against /proc stops here. */
2903 2907 if (lwp != NULL)
2904 2908 lwp->lwp_nostop++;
2905 2909 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2906 2910 mutex_exit(&rp->r_statelock);
2907 2911 if (lwp != NULL)
2908 2912 lwp->lwp_nostop--;
2909 2913 error = EINTR;
2910 2914 goto bottom;
2911 2915 }
2912 2916 if (lwp != NULL)
2913 2917 lwp->lwp_nostop--;
2914 2918 } else
2915 2919 cv_wait(&rp->r_cv, &rp->r_statelock);
2916 2920 }
2917 2921 mutex_exit(&rp->r_statelock);
2918 2922
2919 2923 /*
2920 2924 * Touch the page and fault it in if it is not in core
2921 2925 * before segmap_getmapflt or vpm_data_copy can lock it.
2922 2926 * This is to avoid the deadlock if the buffer is mapped
2923 2927 * to the same file through mmap which we want to write.
2924 2928 */
2925 2929 uio_prefaultpages((long)n, uiop);
2926 2930
2927 2931 if (vpm_enable) {
2928 2932 /*
2929 2933 * It will use kpm mappings, so no need to
2930 2934 * pass an address.
2931 2935 */
2932 2936 error = writerp4(rp, NULL, n, uiop, 0);
2933 2937 } else {
2934 2938 if (segmap_kpm) {
2935 2939 int pon = uiop->uio_loffset & PAGEOFFSET;
2936 2940 size_t pn = MIN(PAGESIZE - pon,
2937 2941 uiop->uio_resid);
2938 2942 int pagecreate;
2939 2943
				/*
				 * Can create (not fault in) the page when
				 * writing a whole page or extending past EOF.
				 */
2940 2944 mutex_enter(&rp->r_statelock);
2941 2945 pagecreate = (pon == 0) && (pn == PAGESIZE ||
2942 2946 uiop->uio_loffset + pn >= rp->r_size);
2943 2947 mutex_exit(&rp->r_statelock);
2944 2948
2945 2949 base = segmap_getmapflt(segkmap, vp, off + on,
2946 2950 pn, !pagecreate, S_WRITE);
2947 2951
2948 2952 error = writerp4(rp, base + pon, n, uiop,
2949 2953 pagecreate);
2950 2954
2951 2955 } else {
2952 2956 base = segmap_getmapflt(segkmap, vp, off + on,
2953 2957 n, 0, S_READ);
2954 2958 error = writerp4(rp, base + on, n, uiop, 0);
2955 2959 }
2956 2960 }
2957 2961
2958 2962 if (!error) {
2959 2963 if (mi->mi_flags & MI4_NOAC)
2960 2964 flags = SM_WRITE;
2961 2965 else if ((uiop->uio_loffset % bsize) == 0 ||
2962 2966 IS_SWAPVP(vp)) {
2963 2967 /*
2964 2968 * Have written a whole block.
2965 2969 * Start an asynchronous write
2966 2970 * and mark the buffer to
2967 2971 * indicate that it won't be
2968 2972 * needed again soon.
2969 2973 */
2970 2974 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
2971 2975 } else
2972 2976 flags = 0;
			/* Synchronous writes and ENOSPC recovery go out now. */
2973 2977 if ((ioflag & (FSYNC|FDSYNC)) ||
2974 2978 (rp->r_flags & R4OUTOFSPACE)) {
2975 2979 flags &= ~SM_ASYNC;
2976 2980 flags |= SM_WRITE;
2977 2981 }
2978 2982 if (vpm_enable) {
2979 2983 error = vpm_sync_pages(vp, off, n, flags);
2980 2984 } else {
2981 2985 error = segmap_release(segkmap, base, flags);
2982 2986 }
2983 2987 } else {
2984 2988 if (vpm_enable) {
2985 2989 (void) vpm_sync_pages(vp, off, n, 0);
2986 2990 } else {
2987 2991 (void) segmap_release(segkmap, base, 0);
2988 2992 }
2989 2993 /*
2990 2994 * In the event that we got an access error while
2991 2995 * faulting in a page for a write-only file just
2992 2996 * force a write.
2993 2997 */
2994 2998 if (error == EACCES)
2995 2999 goto nfs4_fwrite;
2996 3000 }
2997 3001 } while (!error && uiop->uio_resid > 0);
2998 3002
	/*
	 * On error, rewind the uio to the state captured at the start of
	 * the failed chunk; on success, restore any resid trimmed by the
	 * file-size limit check, and under a write delegation stamp the
	 * cached mtime/ctime locally.
	 */
2999 3003 bottom:
3000 3004 if (error) {
3001 3005 uiop->uio_resid = resid + remainder;
3002 3006 uiop->uio_loffset = offset;
3003 3007 } else {
3004 3008 uiop->uio_resid += remainder;
3005 3009
3006 3010 mutex_enter(&rp->r_statev4_lock);
3007 3011 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3008 3012 gethrestime(&rp->r_attr.va_mtime);
3009 3013 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3010 3014 }
3011 3015 mutex_exit(&rp->r_statev4_lock);
3012 3016 }
3013 3017
3014 3018 nfs_rw_exit(&rp->r_lkserlock);
3015 3019
3016 3020 return (error);
3017 3021 }
3018 3022
3019 3023 /*
3020 3024 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 *
 * Perform paged I/O on the page list 'pp' covering [off, off + len):
 * build a buf with pageio_setup/bp_mapin, issue it via nfs4_bio(),
 * then tag every page's p_fsdata so a later commit pass knows whether
 * the data went out UNSTABLE4 (C_DELAYCOMMIT) or is already stable
 * (C_NOCOMMIT).  Async writes are sent UNSTABLE4 only while free
 * memory is above desfree.  Returns the error from nfs4_bio().
3021 3025 */
3022 3026 static int
3023 3027 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3024 3028 int flags, cred_t *cr)
3025 3029 {
3026 3030 struct buf *bp;
3027 3031 int error;
3028 3032 page_t *savepp;
3029 3033 uchar_t fsdata;
3030 3034 stable_how4 stab_comm;
3031 3035
3032 3036 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3033 3037 bp = pageio_setup(pp, len, vp, flags);
3034 3038 ASSERT(bp != NULL);
3035 3039
3036 3040 /*
3037 3041 * pageio_setup should have set b_addr to 0. This
3038 3042 * is correct since we want to do I/O on a page
3039 3043 * boundary. bp_mapin will use this addr to calculate
3040 3044 * an offset, and then set b_addr to the kernel virtual
3041 3045 * address it allocated for us.
3042 3046 */
3043 3047 ASSERT(bp->b_un.b_addr == 0);
3044 3048
3045 3049 bp->b_edev = 0;
3046 3050 bp->b_dev = 0;
3047 3051 bp->b_lblkno = lbtodb(off);
3048 3052 bp->b_file = vp;
3049 3053 bp->b_offset = (offset_t)off;
3050 3054 bp_mapin(bp);
3051 3055
	/*
	 * Async writes may go out unstable (to be committed later)
	 * unless memory is tight, in which case make them stable so the
	 * pages can be freed immediately.
	 */
3052 3056 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3053 3057 freemem > desfree)
3054 3058 stab_comm = UNSTABLE4;
3055 3059 else
3056 3060 stab_comm = FILE_SYNC4;
3057 3061
3058 3062 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3059 3063
3060 3064 bp_mapout(bp);
3061 3065 pageio_done(bp);
3062 3066
	/* nfs4_bio updates stab_comm with how the server actually wrote. */
3063 3067 if (stab_comm == UNSTABLE4)
3064 3068 fsdata = C_DELAYCOMMIT;
3065 3069 else
3066 3070 fsdata = C_NOCOMMIT;
3067 3071
	/* Tag every page in the (circular) list with the commit state. */
3068 3072 savepp = pp;
3069 3073 do {
3070 3074 pp->p_fsdata = fsdata;
3071 3075 } while ((pp = pp->p_next) != savepp);
3072 3076
3073 3077 return (error);
3074 3078 }
3075 3079
3076 3080 /*
 * nfs4rdwr_check_osid: helper for nfs4read/nfs4write after a request
 * using a delegation stateid failed with NFS4ERR_BAD_STATEID.
 *
 * Looks up the open owner and open stream for this cred/rnode.  If
 * the stream was created purely for a delegation (os_delegation set),
 * there is no server-side open stateid yet, so reopen the file via
 * CLAIM_NULL before the caller retries.  Returns 0 if the retry can
 * proceed, EIO if no usable open state exists or the reopen failed.
3077 3081 */
3078 3082 static int
3079 3083 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3080 3084 {
3081 3085 nfs4_open_owner_t *oop;
3082 3086 nfs4_open_stream_t *osp;
3083 3087 rnode4_t *rp = VTOR4(vp);
3084 3088 mntinfo4_t *mi = VTOMI4(vp);
3085 3089 int reopen_needed;
3086 3090
3087 3091 ASSERT(nfs_zone() == mi->mi_zone);
3088 3092
3089 3093
3090 3094 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3091 3095 if (!oop)
3092 3096 return (EIO);
3093 3097
3094 3098 /* returns with 'os_sync_lock' held */
3095 3099 osp = find_open_stream(oop, rp);
3096 3100 if (!osp) {
3097 3101 open_owner_rele(oop);
3098 3102 return (EIO);
3099 3103 }
3100 3104
	/* A previous reopen attempt already failed: give up. */
3101 3105 if (osp->os_failed_reopen) {
3102 3106 mutex_exit(&osp->os_sync_lock);
3103 3107 open_stream_rele(osp, rp);
3104 3108 open_owner_rele(oop);
3105 3109 return (EIO);
3106 3110 }
3107 3111
3108 3112 /*
3109 3113 * Determine whether a reopen is needed. If this
3110 3114 * is a delegation open stream, then the os_delegation bit
3111 3115 * should be set.
3112 3116 */
3113 3117
3114 3118 reopen_needed = osp->os_delegation;
3115 3119
3116 3120 mutex_exit(&osp->os_sync_lock);
3117 3121 open_owner_rele(oop);
3118 3122
3119 3123 if (reopen_needed) {
3120 3124 nfs4_error_zinit(ep);
3121 3125 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
		/* Re-check under the lock that the reopen actually stuck. */
3122 3126 mutex_enter(&osp->os_sync_lock);
3123 3127 if (ep->error || ep->stat || osp->os_failed_reopen) {
3124 3128 mutex_exit(&osp->os_sync_lock);
3125 3129 open_stream_rele(osp, rp);
3126 3130 return (EIO);
3127 3131 }
3128 3132 mutex_exit(&osp->os_sync_lock);
3129 3133 }
3130 3134 open_stream_rele(osp, rp);
3131 3135
3132 3136 return (0);
3133 3137 }
3134 3138
3135 3139 /*
3136 3140 * Write to file. Writes to remote server in largest size
3137 3141 * chunks that the server can handle. Write is synchronous.
 *
 * On entry *stab_comm holds the requested stability (DATA_SYNC4 /
 * FILE_SYNC4 / UNSTABLE4); on return it is FILE_SYNC4 unless the
 * server replied UNSTABLE4 for some chunk.  OLD_STATEID and
 * delegation BAD_STATEID errors are retried here with progressively
 * different stateids, outside the normal recovery framework; other
 * recoverable errors go through nfs4_start_recovery.  Returns 0 or
 * an errno.
3138 3142 */
3139 3143 static int
3140 3144 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3141 3145 stable_how4 *stab_comm)
3142 3146 {
3143 3147 mntinfo4_t *mi;
3144 3148 COMPOUND4args_clnt args;
3145 3149 COMPOUND4res_clnt res;
3146 3150 WRITE4args *wargs;
3147 3151 WRITE4res *wres;
3148 3152 nfs_argop4 argop[2];
3149 3153 nfs_resop4 *resop;
3150 3154 int tsize;
3151 3155 stable_how4 stable;
3152 3156 rnode4_t *rp;
3153 3157 int doqueue = 1;
3154 3158 bool_t needrecov;
3155 3159 nfs4_recov_state_t recov_state;
3156 3160 nfs4_stateid_types_t sid_types;
3157 3161 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3158 3162 int recov;
3159 3163
3160 3164 rp = VTOR4(vp);
3161 3165 mi = VTOMI4(vp);
3162 3166
3163 3167 ASSERT(nfs_zone() == mi->mi_zone);
3164 3168
3165 3169 stable = *stab_comm;
3166 3170 *stab_comm = FILE_SYNC4;
3167 3171
3168 3172 needrecov = FALSE;
3169 3173 recov_state.rs_flags = 0;
3170 3174 recov_state.rs_num_retry_despite_err = 0;
3171 3175 nfs4_init_stateid_types(&sid_types);
3172 3176
3173 3177 /* Is curthread the recovery thread? */
3174 3178 mutex_enter(&mi->mi_lock);
3175 3179 recov = (mi->mi_recovthread == curthread);
3176 3180 mutex_exit(&mi->mi_lock);
3177 3181
3178 3182 recov_retry:
3179 3183 args.ctag = TAG_WRITE;
3180 3184 args.array_len = 2;
3181 3185 args.array = argop;
3182 3186
	/*
	 * The recovery thread skips nfs4_start_fop/nfs4_end_fop --
	 * presumably because it must not block on the recovery that it
	 * is itself driving (NOTE(review): confirm against the recovery
	 * framework docs).
	 */
3183 3187 if (!recov) {
3184 3188 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3185 3189 &recov_state, NULL);
3186 3190 if (e.error)
3187 3191 return (e.error);
3188 3192 }
3189 3193
3190 3194 /* 0. putfh target fh */
3191 3195 argop[0].argop = OP_CPUTFH;
3192 3196 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3193 3197
3194 3198 /* 1. write */
3195 3199 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3196 3200
	/* Loop until 'count' bytes have been written or an error occurs. */
3197 3201 do {
3198 3202
3199 3203 wargs->offset = (offset4)offset;
3200 3204 wargs->data_val = base;
3201 3205
3202 3206 if (mi->mi_io_kstats) {
3203 3207 mutex_enter(&mi->mi_lock);
3204 3208 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3205 3209 mutex_exit(&mi->mi_lock);
3206 3210 }
3207 3211
		/* Direct I/O uses the server transfer size, not curwrite. */
3208 3212 if ((vp->v_flag & VNOCACHE) ||
3209 3213 (rp->r_flags & R4DIRECTIO) ||
3210 3214 (mi->mi_flags & MI4_DIRECTIO))
3211 3215 tsize = MIN(mi->mi_stsize, count);
3212 3216 else
3213 3217 tsize = MIN(mi->mi_curwrite, count);
3214 3218 wargs->data_len = (uint_t)tsize;
3215 3219 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3216 3220
3217 3221 if (mi->mi_io_kstats) {
3218 3222 mutex_enter(&mi->mi_lock);
3219 3223 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3220 3224 mutex_exit(&mi->mi_lock);
3221 3225 }
3222 3226
3223 3227 if (!recov) {
3224 3228 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3225 3229 if (e.error && !needrecov) {
3226 3230 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3227 3231 &recov_state, needrecov);
3228 3232 return (e.error);
3229 3233 }
3230 3234 } else {
3231 3235 if (e.error)
3232 3236 return (e.error);
3233 3237 }
3234 3238
3235 3239 /*
3236 3240 * Do handling of OLD_STATEID outside
3237 3241 * of the normal recovery framework.
3238 3242 *
3239 3243 * If write receives a BAD stateid error while using a
3240 3244 * delegation stateid, retry using the open stateid (if it
3241 3245 * exists). If it doesn't have an open stateid, reopen the
3242 3246 * file first, then retry.
3243 3247 */
3244 3248 if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3245 3249 sid_types.cur_sid_type != SPEC_SID) {
3246 3250 nfs4_save_stateid(&wargs->stateid, &sid_types);
3247 3251 if (!recov)
3248 3252 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3249 3253 &recov_state, needrecov);
3250 3254 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3251 3255 goto recov_retry;
3252 3256 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3253 3257 sid_types.cur_sid_type == DEL_SID) {
			/*
			 * The delegation stateid went bad: flag the
			 * delegation for return, ensure an open stateid
			 * exists (reopening if needed), then retry.
			 */
3254 3258 nfs4_save_stateid(&wargs->stateid, &sid_types);
3255 3259 mutex_enter(&rp->r_statev4_lock);
3256 3260 rp->r_deleg_return_pending = TRUE;
3257 3261 mutex_exit(&rp->r_statev4_lock);
3258 3262 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3259 3263 if (!recov)
3260 3264 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3261 3265 &recov_state, needrecov);
3262 3266 (void) xdr_free(xdr_COMPOUND4res_clnt,
3263 3267 (caddr_t)&res);
3264 3268 return (EIO);
3265 3269 }
3266 3270 if (!recov)
3267 3271 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3268 3272 &recov_state, needrecov);
3269 3273 /* hold needed for nfs4delegreturn_thread */
3270 3274 VN_HOLD(vp);
3271 3275 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3272 3276 NFS4_DR_DISCARD), FALSE);
3273 3277 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3274 3278 goto recov_retry;
3275 3279 }
3276 3280
3277 3281 if (needrecov) {
3278 3282 bool_t abort;
3279 3283
3280 3284 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3281 3285 "nfs4write: client got error %d, res.status %d"
3282 3286 ", so start recovery", e.error, res.status));
3283 3287
3284 3288 abort = nfs4_start_recovery(&e,
3285 3289 VTOMI4(vp), vp, NULL, &wargs->stateid,
3286 3290 NULL, OP_WRITE, NULL, NULL, NULL);
3287 3291 if (!e.error) {
3288 3292 e.error = geterrno4(res.status);
3289 3293 (void) xdr_free(xdr_COMPOUND4res_clnt,
3290 3294 (caddr_t)&res);
3291 3295 }
3292 3296 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3293 3297 &recov_state, needrecov);
3294 3298 if (abort == FALSE)
3295 3299 goto recov_retry;
3296 3300 return (e.error);
3297 3301 }
3298 3302
3299 3303 if (res.status) {
3300 3304 e.error = geterrno4(res.status);
3301 3305 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3302 3306 if (!recov)
3303 3307 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3304 3308 &recov_state, needrecov);
3305 3309 return (e.error);
3306 3310 }
3307 3311
3308 3312 resop = &res.array[1]; /* write res */
3309 3313 wres = &resop->nfs_resop4_u.opwrite;
3310 3314
		/* A server must never claim to have written more than asked. */
3311 3315 if ((int)wres->count > tsize) {
3312 3316 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3313 3317
3314 3318 zcmn_err(getzoneid(), CE_WARN,
3315 3319 "nfs4write: server wrote %u, requested was %u",
3316 3320 (int)wres->count, tsize);
3317 3321 if (!recov)
3318 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3319 3323 &recov_state, needrecov);
3320 3324 return (EIO);
3321 3325 }
		/*
		 * Report UNSTABLE4 back to the caller; if we explicitly
		 * asked for sync stability, an unstable reply is a
		 * protocol violation and the write fails.
		 */
3322 3326 if (wres->committed == UNSTABLE4) {
3323 3327 *stab_comm = UNSTABLE4;
3324 3328 if (wargs->stable == DATA_SYNC4 ||
3325 3329 wargs->stable == FILE_SYNC4) {
3326 3330 (void) xdr_free(xdr_COMPOUND4res_clnt,
3327 3331 (caddr_t)&res);
3328 3332 zcmn_err(getzoneid(), CE_WARN,
3329 3333 "nfs4write: server %s did not commit "
3330 3334 "to stable storage",
3331 3335 rp->r_server->sv_hostname);
3332 3336 if (!recov)
3333 3337 nfs4_end_fop(VTOMI4(vp), vp, NULL,
3334 3338 OH_WRITE, &recov_state, needrecov);
3335 3339 return (EIO);
3336 3340 }
3337 3341 }
3338 3342
		/* Advance past the bytes the server accepted. */
3339 3343 tsize = (int)wres->count;
3340 3344 count -= tsize;
3341 3345 base += tsize;
3342 3346 offset += tsize;
3343 3347 if (mi->mi_io_kstats) {
3344 3348 mutex_enter(&mi->mi_lock);
3345 3349 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3346 3350 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3347 3351 tsize;
3348 3352 mutex_exit(&mi->mi_lock);
3349 3353 }
3350 3354 lwp_stat_update(LWP_STAT_OUBLK, 1);
		/*
		 * Track the server's write verifier.  If it changed since
		 * we last saw it, previously unstable data may have been
		 * lost; nfs4_set_mod re-marks pages modified so they are
		 * rewritten.
		 */
3351 3355 mutex_enter(&rp->r_statelock);
3352 3356 if (rp->r_flags & R4HAVEVERF) {
3353 3357 if (rp->r_writeverf != wres->writeverf) {
3354 3358 nfs4_set_mod(vp);
3355 3359 rp->r_writeverf = wres->writeverf;
3356 3360 }
3357 3361 } else {
3358 3362 rp->r_writeverf = wres->writeverf;
3359 3363 rp->r_flags |= R4HAVEVERF;
3360 3364 }
3361 3365 PURGE_ATTRCACHE4_LOCKED(rp);
3362 3366 rp->r_flags |= R4WRITEMODIFIED;
3363 3367 gethrestime(&rp->r_attr.va_mtime);
3364 3368 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3365 3369 mutex_exit(&rp->r_statelock);
3366 3370 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3367 3371 } while (count);
3368 3372
3369 3373 if (!recov)
3370 3374 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3371 3375 needrecov);
3372 3376
3373 3377 return (e.error);
3374 3378 }
3375 3379
3376 3380 /*
3377 3381 * Read from a file. Reads data in largest chunks our interface can handle.
 *
 * Reads up to 'count' bytes at 'offset' either into 'base' or, if
 * 'base' is NULL, directly into 'uiop'.  On return *residp holds the
 * bytes NOT read (nonzero on EOF before 'count' was satisfied).
 * 'async' marks readahead requests: they are tagged TAG_READAHEAD and
 * use the looser stateid-error retry policy described in the comment
 * inside the loop.  Returns 0 or an errno.
3378 3382 */
3379 3383 static int
3380 3384 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3381 3385 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3382 3386 {
3383 3387 mntinfo4_t *mi;
3384 3388 COMPOUND4args_clnt args;
3385 3389 COMPOUND4res_clnt res;
3386 3390 READ4args *rargs;
3387 3391 nfs_argop4 argop[2];
3388 3392 int tsize;
3389 3393 int doqueue;
3390 3394 rnode4_t *rp;
3391 3395 int data_len;
3392 3396 bool_t is_eof;
3393 3397 bool_t needrecov = FALSE;
3394 3398 nfs4_recov_state_t recov_state;
3395 3399 nfs4_stateid_types_t sid_types;
3396 3400 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3397 3401
3398 3402 rp = VTOR4(vp);
3399 3403 mi = VTOMI4(vp);
3400 3404 doqueue = 1;
3401 3405
3402 3406 ASSERT(nfs_zone() == mi->mi_zone);
3403 3407
3404 3408 args.ctag = async ? TAG_READAHEAD : TAG_READ;
3405 3409
3406 3410 args.array_len = 2;
3407 3411 args.array = argop;
3408 3412
3409 3413 nfs4_init_stateid_types(&sid_types);
3410 3414
3411 3415 recov_state.rs_flags = 0;
3412 3416 recov_state.rs_num_retry_despite_err = 0;
3413 3417
3414 3418 recov_retry:
3415 3419 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3416 3420 &recov_state, NULL);
3417 3421 if (e.error)
3418 3422 return (e.error);
3419 3423
3420 3424 /* putfh target fh */
3421 3425 argop[0].argop = OP_CPUTFH;
3422 3426 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3423 3427
3424 3428 /* read */
3425 3429 argop[1].argop = OP_READ;
3426 3430 rargs = &argop[1].nfs_argop4_u.opread;
3427 3431 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3428 3432 OP_READ, &sid_types, async);
3429 3433
	/* Loop until 'count' bytes are read, EOF, or an error occurs. */
3430 3434 do {
3431 3435 if (mi->mi_io_kstats) {
3432 3436 mutex_enter(&mi->mi_lock);
3433 3437 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3434 3438 mutex_exit(&mi->mi_lock);
3435 3439 }
3436 3440
3437 3441 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3438 3442 "nfs4read: %s call, rp %s",
3439 3443 needrecov ? "recov" : "first",
3440 3444 rnode4info(rp)));
3441 3445
		/* Direct I/O uses the server transfer size, not curread. */
3442 3446 if ((vp->v_flag & VNOCACHE) ||
3443 3447 (rp->r_flags & R4DIRECTIO) ||
3444 3448 (mi->mi_flags & MI4_DIRECTIO))
3445 3449 tsize = MIN(mi->mi_tsize, count);
3446 3450 else
3447 3451 tsize = MIN(mi->mi_curread, count);
3448 3452
3449 3453 rargs->offset = (offset4)offset;
3450 3454 rargs->count = (count4)tsize;
3451 3455 rargs->res_data_val_alt = NULL;
3452 3456 rargs->res_mblk = NULL;
3453 3457 rargs->res_uiop = NULL;
3454 3458 rargs->res_maxsize = 0;
3455 3459 rargs->wlist = NULL;
3456 3460
		/* Deliver reply data into the uio if given, else into base. */
3457 3461 if (uiop)
3458 3462 rargs->res_uiop = uiop;
3459 3463 else
3460 3464 rargs->res_data_val_alt = base;
3461 3465 rargs->res_maxsize = tsize;
3462 3466
3463 3467 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3464 3468 #ifdef DEBUG
3465 3469 if (nfs4read_error_inject) {
3466 3470 res.status = nfs4read_error_inject;
3467 3471 nfs4read_error_inject = 0;
3468 3472 }
3469 3473 #endif
3470 3474
3471 3475 if (mi->mi_io_kstats) {
3472 3476 mutex_enter(&mi->mi_lock);
3473 3477 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3474 3478 mutex_exit(&mi->mi_lock);
3475 3479 }
3476 3480
3477 3481 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3478 3482 if (e.error != 0 && !needrecov) {
3479 3483 nfs4_end_fop(mi, vp, NULL, OH_READ,
3480 3484 &recov_state, needrecov);
3481 3485 return (e.error);
3482 3486 }
3483 3487
3484 3488 /*
3485 3489 * Do proper retry for OLD and BAD stateid errors outside
3486 3490 * of the normal recovery framework. There are two differences
3487 3491 * between async and sync reads. The first is that we allow
3488 3492 * retry on BAD_STATEID for async reads, but not sync reads.
3489 3493 * The second is that we mark the file dead for a failed
3490 3494 * attempt with a special stateid for sync reads, but just
3491 3495 * return EIO for async reads.
3492 3496 *
3493 3497 * If a sync read receives a BAD stateid error while using a
3494 3498 * delegation stateid, retry using the open stateid (if it
3495 3499 * exists). If it doesn't have an open stateid, reopen the
3496 3500 * file first, then retry.
3497 3501 */
3498 3502 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3499 3503 res.status == NFS4ERR_BAD_STATEID) && async) {
3500 3504 nfs4_end_fop(mi, vp, NULL, OH_READ,
3501 3505 &recov_state, needrecov);
3502 3506 if (sid_types.cur_sid_type == SPEC_SID) {
3503 3507 (void) xdr_free(xdr_COMPOUND4res_clnt,
3504 3508 (caddr_t)&res);
3505 3509 return (EIO);
3506 3510 }
3507 3511 nfs4_save_stateid(&rargs->stateid, &sid_types);
3508 3512 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3509 3513 goto recov_retry;
3510 3514 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3511 3515 !async && sid_types.cur_sid_type != SPEC_SID) {
3512 3516 nfs4_save_stateid(&rargs->stateid, &sid_types);
3513 3517 nfs4_end_fop(mi, vp, NULL, OH_READ,
3514 3518 &recov_state, needrecov);
3515 3519 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3516 3520 goto recov_retry;
3517 3521 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3518 3522 sid_types.cur_sid_type == DEL_SID) {
			/*
			 * Delegation stateid went bad: flag the delegation
			 * for return, ensure an open stateid exists
			 * (reopening if needed), then retry.
			 */
3519 3523 nfs4_save_stateid(&rargs->stateid, &sid_types);
3520 3524 mutex_enter(&rp->r_statev4_lock);
3521 3525 rp->r_deleg_return_pending = TRUE;
3522 3526 mutex_exit(&rp->r_statev4_lock);
3523 3527 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3524 3528 nfs4_end_fop(mi, vp, NULL, OH_READ,
3525 3529 &recov_state, needrecov);
3526 3530 (void) xdr_free(xdr_COMPOUND4res_clnt,
3527 3531 (caddr_t)&res);
3528 3532 return (EIO);
3529 3533 }
3530 3534 nfs4_end_fop(mi, vp, NULL, OH_READ,
3531 3535 &recov_state, needrecov);
3532 3536 /* hold needed for nfs4delegreturn_thread */
3533 3537 VN_HOLD(vp);
3534 3538 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3535 3539 NFS4_DR_DISCARD), FALSE);
3536 3540 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3537 3541 goto recov_retry;
3538 3542 }
3539 3543 if (needrecov) {
3540 3544 bool_t abort;
3541 3545
3542 3546 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3543 3547 "nfs4read: initiating recovery\n"));
3544 3548 abort = nfs4_start_recovery(&e,
3545 3549 mi, vp, NULL, &rargs->stateid,
3546 3550 NULL, OP_READ, NULL, NULL, NULL);
3547 3551 nfs4_end_fop(mi, vp, NULL, OH_READ,
3548 3552 &recov_state, needrecov);
3549 3553 /*
3550 3554 * Do not retry if we got OLD_STATEID using a special
3551 3555 * stateid. This avoids looping with a broken server.
3552 3556 */
3553 3557 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3554 3558 sid_types.cur_sid_type == SPEC_SID)
3555 3559 abort = TRUE;
3556 3560
3557 3561 if (abort == FALSE) {
3558 3562 /*
3559 3563 * Need to retry all possible stateids in
3560 3564 * case the recovery error wasn't stateid
3561 3565 * related or the stateids have become
3562 3566 * stale (server reboot).
3563 3567 */
3564 3568 nfs4_init_stateid_types(&sid_types);
3565 3569 (void) xdr_free(xdr_COMPOUND4res_clnt,
3566 3570 (caddr_t)&res);
3567 3571 goto recov_retry;
3568 3572 }
3569 3573
3570 3574 if (!e.error) {
3571 3575 e.error = geterrno4(res.status);
3572 3576 (void) xdr_free(xdr_COMPOUND4res_clnt,
3573 3577 (caddr_t)&res);
3574 3578 }
3575 3579 return (e.error);
3576 3580 }
3577 3581
3578 3582 if (res.status) {
3579 3583 e.error = geterrno4(res.status);
3580 3584 nfs4_end_fop(mi, vp, NULL, OH_READ,
3581 3585 &recov_state, needrecov);
3582 3586 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3583 3587 return (e.error);
3584 3588 }
3585 3589
		/* Advance past the bytes the server returned. */
3586 3590 data_len = res.array[1].nfs_resop4_u.opread.data_len;
3587 3591 count -= data_len;
3588 3592 if (base)
3589 3593 base += data_len;
3590 3594 offset += data_len;
3591 3595 if (mi->mi_io_kstats) {
3592 3596 mutex_enter(&mi->mi_lock);
3593 3597 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3594 3598 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3595 3599 mutex_exit(&mi->mi_lock);
3596 3600 }
3597 3601 lwp_stat_update(LWP_STAT_INBLK, 1);
3598 3602 is_eof = res.array[1].nfs_resop4_u.opread.eof;
3599 3603 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3600 3604
3601 3605 } while (count && !is_eof);
3602 3606
3603 3607 *residp = count;
3604 3608
3605 3609 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3606 3610
3607 3611 return (e.error);
3608 3612 }
3609 3613
3610 3614 /* ARGSUSED */
3611 3615 static int
3612 3616 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3613 3617 caller_context_t *ct)
3614 3618 {
3615 3619 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3616 3620 return (EIO);
3617 3621 switch (cmd) {
3618 3622 case _FIODIRECTIO:
3619 3623 return (nfs4_directio(vp, (int)arg, cr));
3620 3624 default:
3621 3625 return (ENOTTY);
3622 3626 }
3623 3627 }
3624 3628
3625 3629 /* ARGSUSED */
3626 3630 int
3627 3631 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3628 3632 caller_context_t *ct)
3629 3633 {
3630 3634 int error;
3631 3635 rnode4_t *rp = VTOR4(vp);
3632 3636
3633 3637 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3634 3638 return (EIO);
3635 3639 /*
3636 3640 * If it has been specified that the return value will
3637 3641 * just be used as a hint, and we are only being asked
3638 3642 * for size, fsid or rdevid, then return the client's
3639 3643 * notion of these values without checking to make sure
3640 3644 * that the attribute cache is up to date.
3641 3645 * The whole point is to avoid an over the wire GETATTR
3642 3646 * call.
3643 3647 */
3644 3648 if (flags & ATTR_HINT) {
3645 3649 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3646 3650 mutex_enter(&rp->r_statelock);
3647 3651 if (vap->va_mask & AT_SIZE)
3648 3652 vap->va_size = rp->r_size;
3649 3653 if (vap->va_mask & AT_FSID)
3650 3654 vap->va_fsid = rp->r_attr.va_fsid;
3651 3655 if (vap->va_mask & AT_RDEV)
3652 3656 vap->va_rdev = rp->r_attr.va_rdev;
3653 3657 mutex_exit(&rp->r_statelock);
3654 3658 return (0);
3655 3659 }
3656 3660 }
3657 3661
3658 3662 /*
3659 3663 * Only need to flush pages if asking for the mtime
3660 3664 * and if there any dirty pages or any outstanding
3661 3665 * asynchronous (write) requests for this file.
3662 3666 */
3663 3667 if (vap->va_mask & AT_MTIME) {
3664 3668 rp = VTOR4(vp);
3665 3669 if (nfs4_has_pages(vp)) {
3666 3670 mutex_enter(&rp->r_statev4_lock);
3667 3671 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3668 3672 mutex_exit(&rp->r_statev4_lock);
3669 3673 if (rp->r_flags & R4DIRTY ||
3670 3674 rp->r_awcount > 0) {
3671 3675 mutex_enter(&rp->r_statelock);
3672 3676 rp->r_gcount++;
3673 3677 mutex_exit(&rp->r_statelock);
3674 3678 error =
3675 3679 nfs4_putpage(vp, (u_offset_t)0,
3676 3680 0, 0, cr, NULL);
3677 3681 mutex_enter(&rp->r_statelock);
3678 3682 if (error && (error == ENOSPC ||
3679 3683 error == EDQUOT)) {
3680 3684 if (!rp->r_error)
3681 3685 rp->r_error = error;
3682 3686 }
3683 3687 if (--rp->r_gcount == 0)
3684 3688 cv_broadcast(&rp->r_cv);
3685 3689 mutex_exit(&rp->r_statelock);
3686 3690 }
3687 3691 } else {
3688 3692 mutex_exit(&rp->r_statev4_lock);
3689 3693 }
3690 3694 }
3691 3695 }
3692 3696 return (nfs4getattr(vp, vap, cr));
3693 3697 }
3694 3698
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	/*
	 * Compare the mode reported by the server against the client's
	 * cached mode, disregarding the setuid/setgid bits on the client
	 * side.  Returns 0 (OK) when the only difference is that the
	 * server cleared S_ISUID/S_ISGID, 1 (BAD) otherwise.
	 */
	mode_t masked = on_client & ~(S_ISUID | S_ISGID);

	return ((masked == from_server) ? 0 : 1);
}
3709 3713
3710 3714 /*ARGSUSED4*/
3711 3715 static int
3712 3716 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3713 3717 caller_context_t *ct)
3714 3718 {
3715 3719 if (vap->va_mask & AT_NOSET)
3716 3720 return (EINVAL);
3717 3721
3718 3722 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3719 3723 return (EIO);
3720 3724
3721 3725 /*
3722 3726 * Don't call secpolicy_vnode_setattr, the client cannot
3723 3727 * use its cached attributes to make security decisions
3724 3728 * as the server may be faking mode bits or mapping uid/gid.
3725 3729 * Always just let the server to the checking.
3726 3730 * If we provide the ability to remove basic priviledges
3727 3731 * to setattr (e.g. basic without chmod) then we will
3728 3732 * need to add a check here before calling the server.
3729 3733 */
3730 3734
3731 3735 return (nfs4setattr(vp, vap, flags, cr, NULL));
3732 3736 }
3733 3737
/*
 * To replace the "guarded" version 3 setattr, we use two types of compound
 * setattr requests:
 * 1. The "normal" setattr, used when the size of the file isn't being
 *    changed - { Putfh <fh>; Setattr; Getattr }.
 * 2. If the size is changed, precede Setattr with:  Getattr; Verify
 *    with only ctime as the argument.  If the server ctime differs from
 *    what is cached on the client, the verify will fail, but we would
 *    already have the ctime from the preceding getattr, so just set it
 *    and retry.  Thus the compound here is - { Putfh <fh>; Getattr; Verify;
 *    Setattr; Getattr }.
 *
 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
 * this setattr and NULL if they are not.
 */
static int
nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    vsecattr_t *vsap)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs4_ga_res_t *garp = NULL;
	int numops = 3;			/* { Putfh; Setattr; Getattr } */
	nfs_argop4 argop[5];
	int verify_argop = -1;		/* index of Verify op, -1 if none */
	int setattr_argop = 1;		/* index of Setattr op */
	nfs_resop4 *resop;
	vattr_t va;
	rnode4_t *rp;
	int doqueue = 1;
	uint_t mask = vap->va_mask;
	mode_t omode;			/* mode before the setattr */
	vsecattr_t *vsp;
	timestruc_t ctime;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	stateid4 stateid;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	servinfo4_t *svp;
	bitmap4 supp_attrs;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	rp = VTOR4(vp);
	nfs4_init_stateid_types(&sid_types);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with R4DIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	if (mask & AT_SIZE) {
		/*
		 * Verification setattr compound for non-deleg AT_SIZE:
		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
		 * Set ctime local here (outside the do_again label)
		 * so that subsequent retries (after failed VERIFY)
		 * will use ctime from GETATTR results (from failed
		 * verify compound) as VERIFY arg.
		 * If file has delegation, then VERIFY(time_metadata)
		 * is of little added value, so don't bother.
		 */
		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
		    rp->r_deleg_return_pending) {
			numops = 5;
			ctime = rp->r_attr.va_ctime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	args.ctag = TAG_SETATTR;
do_again:
recov_retry:
	/* Setattr is always the third-from-last op in the compound. */
	setattr_argop = numops - 2;

	args.array = argop;
	args.array_len = numops;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error)
		return (e.error);


	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	if (numops == 5) {
		/*
		 * We only care about the ctime, but need to get mtime
		 * and size for proper cache update.
		 */
		/* getattr */
		argop[1].argop = OP_GETATTR;
		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

		/* verify - set later in loop */
		verify_argop = 2;
	}

	/* setattr */
	svp = rp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
	    supp_attrs, &e.error, &sid_types);
	/* remember the stateid used, in case of OLD_STATEID retry below */
	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
	if (e.error) {
		/* req time field(s) overflow - return immediately */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		return (e.error);
	}
	omode = rp->r_attr.va_mode;

	/* getattr */
	argop[numops-1].argop = OP_GETATTR;
	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	/*
	 * If we are setting the ACL (indicated only by vsap != NULL), request
	 * the ACL in this getattr.  The ACL returned from this getattr will be
	 * used in updating the ACL cache.
	 */
	if (vsap != NULL)
		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
		    FATTR4_ACL_MASK;
	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	/*
	 * setattr iterates if the object size is set and the cached ctime
	 * does not match the file ctime.  In that case, verify the ctime first.
	 */

	do {
		if (verify_argop != -1) {
			/*
			 * Verify that the ctime match before doing setattr.
			 */
			va.va_mask = AT_CTIME;
			va.va_ctime = ctime;
			svp = rp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);
			e.error = nfs4args_verify(&argop[verify_argop], &va,
			    OP_VERIFY, supp_attrs);
			if (e.error) {
				/* req time field(s) overflow - return */
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		doqueue = 1;

		t = gethrtime();

		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

		/*
		 * Purge the access cache and ACL cache if changing either the
		 * owner of the file, the group owner, or the mode.  These may
		 * change the access permissions of the file, so purge old
		 * information and start over again.
		 */
		if (mask & (AT_UID | AT_GID | AT_MODE)) {
			(void) nfs4_access_purge_rp(rp);
			if (rp->r_secattr != NULL) {
				mutex_enter(&rp->r_statelock);
				vsp = rp->r_secattr;
				rp->r_secattr = NULL;
				mutex_exit(&rp->r_statelock);
				if (vsp != NULL)
					nfs4_acl_free_cache(vsp);
			}
		}

		/*
		 * If res.array_len == numops, then everything succeeded,
		 * except for possibly the final getattr.  If only the
		 * last getattr failed, give up, and don't try recovery.
		 */
		if (res.array_len == numops) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			if (! e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL, NULL, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid.  This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock. See RFE
		 * 4777612.  Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which my cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check if verify failed to see if try again
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server. This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
			 */
			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		}
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			nfs4_purge_stale_fh(e.error, vp, cr);
		} else {
			/*
			 * retry with a new verify value
			 */
			ctime = garp->n4g_va.va_ctime;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			resp = NULL;
		}
		if (!e.error) {
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto do_again;
		}
	} while (!e.error);

	if (e.error) {
		/*
		 * If we are here, rfs4call has an irrecoverable error - return
		 */
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		if (verify_argop != -1) {
			nfs4args_verify_free(&argop[verify_argop]);
			verify_argop = -1;
		}
		if (resp)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
		return (e.error);
	}



	/*
	 * If changing the size of the file, invalidate
	 * any local cached data which is no longer part
	 * of the file.  We also possibly invalidate the
	 * last page in the file.  We could use
	 * pvn_vpzero(), but this would mark the page as
	 * modified and require it to be written back to
	 * the server for no particularly good reason.
	 * This way, if we access it, then we bring it
	 * back in.  A read should be cheaper than a
	 * write.
	 */
	if (mask & AT_SIZE) {
		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	garp = NULL;
	if (res.status == NFS4_OK) {
		/*
		 * Last getattr
		 */
		resop = &res.array[numops - 1];
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	/*
	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
	 * rather than filling it.  See the function itself for details.
	 */
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
	if (garp != NULL) {
		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
			vs_ace4_destroy(&garp->n4g_vsa);
		} else {
			if (vsap != NULL) {
				/*
				 * The ACL was supposed to be set and to be
				 * returned in the last getattr of this
				 * compound, but for some reason the getattr
				 * result doesn't contain the ACL.  In this
				 * case, purge the ACL cache.
				 */
				if (rp->r_secattr != NULL) {
					mutex_enter(&rp->r_statelock);
					vsp = rp->r_secattr;
					rp->r_secattr = NULL;
					mutex_exit(&rp->r_statelock);
					if (vsp != NULL)
						nfs4_acl_free_cache(vsp);
				}
			}
		}
	}

	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
		/*
		 * Set the size, rather than relying on getting it updated
		 * via a GETATTR.  With delegations the client tries to
		 * suppress GETATTR calls.
		 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid.  The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked the mode to be changed and what
				 * we just got from the server in getattr is
				 * not what we wanted it to be, so set it now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask the mode to be changed,
				 * Check to see that the server just cleared
				 * I_SUID and I_GUID from it. If not then
				 * set mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}
4242 4246
/*
 * VOP_ACCESS for NFSv4.  Translates the VREAD/VWRITE/VEXEC request into
 * NFSv4 ACCESS bits, consults the per-rnode access cache first, and only
 * goes over the wire (ACCESS, optionally followed by GETATTR when there
 * is no delegation) on a cache miss.  If access is denied and the
 * credential can be "net-adjusted" (setuid-root semantics), the check is
 * retried with the adjusted credential.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/* Map the requested vnode access mode to NFSv4 ACCESS bits. */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	/*
	 * Ask the server about all the bits relevant to this vnode type,
	 * not just the ones requested, so the answer can be cached for
	 * future checks.
	 */
	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take with r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4458 4462
4459 4463 /* ARGSUSED */
4460 4464 static int
4461 4465 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4462 4466 {
4463 4467 COMPOUND4args_clnt args;
4464 4468 COMPOUND4res_clnt res;
4465 4469 int doqueue;
4466 4470 rnode4_t *rp;
4467 4471 nfs_argop4 argop[3];
4468 4472 nfs_resop4 *resop;
4469 4473 READLINK4res *lr_res;
4470 4474 nfs4_ga_res_t *garp;
4471 4475 uint_t len;
4472 4476 char *linkdata;
4473 4477 bool_t needrecov = FALSE;
4474 4478 nfs4_recov_state_t recov_state;
4475 4479 hrtime_t t;
4476 4480 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4477 4481
4478 4482 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4479 4483 return (EIO);
4480 4484 /*
4481 4485 * Can't readlink anything other than a symbolic link.
4482 4486 */
4483 4487 if (vp->v_type != VLNK)
4484 4488 return (EINVAL);
4485 4489
4486 4490 rp = VTOR4(vp);
4487 4491 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4488 4492 e.error = nfs4_validate_caches(vp, cr);
4489 4493 if (e.error)
4490 4494 return (e.error);
4491 4495 mutex_enter(&rp->r_statelock);
4492 4496 if (rp->r_symlink.contents != NULL) {
4493 4497 e.error = uiomove(rp->r_symlink.contents,
4494 4498 rp->r_symlink.len, UIO_READ, uiop);
4495 4499 mutex_exit(&rp->r_statelock);
4496 4500 return (e.error);
4497 4501 }
4498 4502 mutex_exit(&rp->r_statelock);
4499 4503 }
4500 4504 recov_state.rs_flags = 0;
4501 4505 recov_state.rs_num_retry_despite_err = 0;
4502 4506
4503 4507 recov_retry:
4504 4508 args.array_len = 3;
4505 4509 args.array = argop;
4506 4510 args.ctag = TAG_READLINK;
4507 4511
4508 4512 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4509 4513 if (e.error) {
4510 4514 return (e.error);
4511 4515 }
4512 4516
4513 4517 /* 0. putfh symlink fh */
4514 4518 argop[0].argop = OP_CPUTFH;
4515 4519 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4516 4520
4517 4521 /* 1. readlink */
4518 4522 argop[1].argop = OP_READLINK;
4519 4523
4520 4524 /* 2. getattr */
4521 4525 argop[2].argop = OP_GETATTR;
4522 4526 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4523 4527 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4524 4528
4525 4529 doqueue = 1;
4526 4530
4527 4531 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4528 4532 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4529 4533 rnode4info(VTOR4(vp))));
4530 4534
4531 4535 t = gethrtime();
4532 4536
4533 4537 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4534 4538
4535 4539 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4536 4540 if (needrecov) {
4537 4541 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4538 4542 "nfs4_readlink: initiating recovery\n"));
4539 4543
4540 4544 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4541 4545 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4542 4546 if (!e.error)
4543 4547 (void) xdr_free(xdr_COMPOUND4res_clnt,
4544 4548 (caddr_t)&res);
4545 4549
4546 4550 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4547 4551 needrecov);
4548 4552 goto recov_retry;
4549 4553 }
4550 4554 }
4551 4555
4552 4556 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4553 4557
4554 4558 if (e.error)
4555 4559 return (e.error);
4556 4560
4557 4561 /*
4558 4562 * There is an path in the code below which calls
4559 4563 * nfs4_purge_stale_fh(), which may generate otw calls through
4560 4564 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4561 4565 * here to avoid nfs4_start_op() deadlock.
4562 4566 */
4563 4567
4564 4568 if (res.status && (res.array_len < args.array_len)) {
4565 4569 /*
4566 4570 * either Putfh or Link failed
4567 4571 */
4568 4572 e.error = geterrno4(res.status);
4569 4573 nfs4_purge_stale_fh(e.error, vp, cr);
4570 4574 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4571 4575 return (e.error);
4572 4576 }
4573 4577
4574 4578 resop = &res.array[1]; /* readlink res */
4575 4579 lr_res = &resop->nfs_resop4_u.opreadlink;
4576 4580
4577 4581 /*
4578 4582 * treat symlink names as data
4579 4583 */
4580 4584 linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4581 4585 if (linkdata != NULL) {
4582 4586 int uio_len = len - 1;
4583 4587 /* len includes null byte, which we won't uiomove */
4584 4588 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4585 4589 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4586 4590 mutex_enter(&rp->r_statelock);
4587 4591 if (rp->r_symlink.contents == NULL) {
4588 4592 rp->r_symlink.contents = linkdata;
4589 4593 rp->r_symlink.len = uio_len;
4590 4594 rp->r_symlink.size = len;
4591 4595 mutex_exit(&rp->r_statelock);
4592 4596 } else {
4593 4597 mutex_exit(&rp->r_statelock);
4594 4598 kmem_free(linkdata, len);
4595 4599 }
4596 4600 } else {
4597 4601 kmem_free(linkdata, len);
4598 4602 }
4599 4603 }
4600 4604 if (res.status == NFS4_OK) {
4601 4605 resop++; /* getattr res */
4602 4606 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4603 4607 }
4604 4608 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4605 4609
4606 4610 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4607 4611
4608 4612 /*
4609 4613 * The over the wire error for attempting to readlink something
4610 4614 * other than a symbolic link is ENXIO. However, we need to
4611 4615 * return EINVAL instead of ENXIO, so we map it here.
4612 4616 */
4613 4617 return (e.error == ENXIO ? EINVAL : e.error);
4614 4618 }
4615 4619
4616 4620 /*
4617 4621 * Flush local dirty pages to stable storage on the server.
4618 4622 *
4619 4623 * If FNODSYNC is specified, then there is nothing to do because
4620 4624 * metadata changes are not cached on the client before being
4621 4625 * sent to the server.
4622 4626 */
4623 4627 /* ARGSUSED */
4624 4628 static int
4625 4629 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4626 4630 {
4627 4631 int error;
4628 4632
4629 4633 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4630 4634 return (0);
4631 4635 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4632 4636 return (EIO);
4633 4637 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4634 4638 if (!error)
4635 4639 error = VTOR4(vp)->r_error;
4636 4640 return (error);
4637 4641 }
4638 4642
4639 4643 /*
4640 4644 * Weirdness: if the file was removed or the target of a rename
4641 4645 * operation while it was open, it got renamed instead. Here we
4642 4646 * remove the renamed file.
4643 4647 */
4644 4648 /* ARGSUSED */
4645 4649 void
4646 4650 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4647 4651 {
4648 4652 rnode4_t *rp;
4649 4653
4650 4654 ASSERT(vp != DNLC_NO_VNODE);
4651 4655
4652 4656 rp = VTOR4(vp);
4653 4657
4654 4658 if (IS_SHADOW(vp, rp)) {
4655 4659 sv_inactive(vp);
4656 4660 return;
4657 4661 }
4658 4662
4659 4663 /*
4660 4664 * If this is coming from the wrong zone, we let someone in the right
4661 4665 * zone take care of it asynchronously. We can get here due to
4662 4666 * VN_RELE() being called from pageout() or fsflush(). This call may
4663 4667 * potentially turn into an expensive no-op if, for instance, v_count
4664 4668 * gets incremented in the meantime, but it's still correct.
4665 4669 */
4666 4670 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4667 4671 nfs4_async_inactive(vp, cr);
4668 4672 return;
4669 4673 }
4670 4674
4671 4675 /*
4672 4676 * Some of the cleanup steps might require over-the-wire
4673 4677 * operations. Since VOP_INACTIVE can get called as a result of
4674 4678 * other over-the-wire operations (e.g., an attribute cache update
4675 4679 * can lead to a DNLC purge), doing those steps now would lead to a
4676 4680 * nested call to the recovery framework, which can deadlock. So
4677 4681 * do any over-the-wire cleanups asynchronously, in a separate
4678 4682 * thread.
4679 4683 */
4680 4684
4681 4685 mutex_enter(&rp->r_os_lock);
4682 4686 mutex_enter(&rp->r_statelock);
4683 4687 mutex_enter(&rp->r_statev4_lock);
4684 4688
4685 4689 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4686 4690 mutex_exit(&rp->r_statev4_lock);
4687 4691 mutex_exit(&rp->r_statelock);
4688 4692 mutex_exit(&rp->r_os_lock);
4689 4693 nfs4_async_inactive(vp, cr);
4690 4694 return;
4691 4695 }
4692 4696
4693 4697 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4694 4698 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4695 4699 mutex_exit(&rp->r_statev4_lock);
4696 4700 mutex_exit(&rp->r_statelock);
4697 4701 mutex_exit(&rp->r_os_lock);
4698 4702 nfs4_async_inactive(vp, cr);
4699 4703 return;
4700 4704 }
4701 4705
4702 4706 if (rp->r_unldvp != NULL) {
4703 4707 mutex_exit(&rp->r_statev4_lock);
4704 4708 mutex_exit(&rp->r_statelock);
4705 4709 mutex_exit(&rp->r_os_lock);
4706 4710 nfs4_async_inactive(vp, cr);
4707 4711 return;
4708 4712 }
4709 4713 mutex_exit(&rp->r_statev4_lock);
4710 4714 mutex_exit(&rp->r_statelock);
4711 4715 mutex_exit(&rp->r_os_lock);
4712 4716
4713 4717 rp4_addfree(rp, cr);
4714 4718 }
4715 4719
4716 4720 /*
4717 4721 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4718 4722 * various bits of state. The caller must not refer to vp after this call.
4719 4723 */
4720 4724
4721 4725 void
4722 4726 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4723 4727 {
4724 4728 rnode4_t *rp = VTOR4(vp);
4725 4729 nfs4_recov_state_t recov_state;
4726 4730 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4727 4731 vnode_t *unldvp;
4728 4732 char *unlname;
4729 4733 cred_t *unlcred;
4730 4734 COMPOUND4args_clnt args;
4731 4735 COMPOUND4res_clnt res, *resp;
4732 4736 nfs_argop4 argop[2];
4733 4737 int doqueue;
4734 4738 #ifdef DEBUG
4735 4739 char *name;
4736 4740 #endif
4737 4741
4738 4742 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4739 4743 ASSERT(!IS_SHADOW(vp, rp));
4740 4744
4741 4745 #ifdef DEBUG
4742 4746 name = fn_name(VTOSV(vp)->sv_name);
4743 4747 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4744 4748 "release vnode %s", name));
4745 4749 kmem_free(name, MAXNAMELEN);
4746 4750 #endif
4747 4751
4748 4752 if (vp->v_type == VREG) {
4749 4753 bool_t recov_failed = FALSE;
4750 4754
4751 4755 e.error = nfs4close_all(vp, cr);
4752 4756 if (e.error) {
4753 4757 /* Check to see if recovery failed */
4754 4758 mutex_enter(&(VTOMI4(vp)->mi_lock));
4755 4759 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4756 4760 recov_failed = TRUE;
4757 4761 mutex_exit(&(VTOMI4(vp)->mi_lock));
4758 4762 if (!recov_failed) {
4759 4763 mutex_enter(&rp->r_statelock);
4760 4764 if (rp->r_flags & R4RECOVERR)
4761 4765 recov_failed = TRUE;
4762 4766 mutex_exit(&rp->r_statelock);
4763 4767 }
4764 4768 if (recov_failed) {
4765 4769 NFS4_DEBUG(nfs4_client_recov_debug,
4766 4770 (CE_NOTE, "nfs4_inactive_otw: "
4767 4771 "close failed (recovery failure)"));
4768 4772 }
4769 4773 }
4770 4774 }
4771 4775
4772 4776 redo:
4773 4777 if (rp->r_unldvp == NULL) {
4774 4778 rp4_addfree(rp, cr);
4775 4779 return;
4776 4780 }
4777 4781
4778 4782 /*
4779 4783 * Save the vnode pointer for the directory where the
4780 4784 * unlinked-open file got renamed, then set it to NULL
4781 4785 * to prevent another thread from getting here before
4782 4786 * we're done with the remove. While we have the
4783 4787 * statelock, make local copies of the pertinent rnode
4784 4788 * fields. If we weren't to do this in an atomic way, the
4785 4789 * the unl* fields could become inconsistent with respect
4786 4790 * to each other due to a race condition between this
4787 4791 * code and nfs_remove(). See bug report 1034328.
4788 4792 */
4789 4793 mutex_enter(&rp->r_statelock);
4790 4794 if (rp->r_unldvp == NULL) {
4791 4795 mutex_exit(&rp->r_statelock);
4792 4796 rp4_addfree(rp, cr);
4793 4797 return;
4794 4798 }
4795 4799
4796 4800 unldvp = rp->r_unldvp;
4797 4801 rp->r_unldvp = NULL;
4798 4802 unlname = rp->r_unlname;
4799 4803 rp->r_unlname = NULL;
4800 4804 unlcred = rp->r_unlcred;
4801 4805 rp->r_unlcred = NULL;
4802 4806 mutex_exit(&rp->r_statelock);
4803 4807
4804 4808 /*
4805 4809 * If there are any dirty pages left, then flush
4806 4810 * them. This is unfortunate because they just
4807 4811 * may get thrown away during the remove operation,
4808 4812 * but we have to do this for correctness.
4809 4813 */
4810 4814 if (nfs4_has_pages(vp) &&
4811 4815 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4812 4816 ASSERT(vp->v_type != VCHR);
4813 4817 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4814 4818 if (e.error) {
4815 4819 mutex_enter(&rp->r_statelock);
4816 4820 if (!rp->r_error)
4817 4821 rp->r_error = e.error;
4818 4822 mutex_exit(&rp->r_statelock);
4819 4823 }
4820 4824 }
4821 4825
4822 4826 recov_state.rs_flags = 0;
4823 4827 recov_state.rs_num_retry_despite_err = 0;
4824 4828 recov_retry_remove:
4825 4829 /*
4826 4830 * Do the remove operation on the renamed file
4827 4831 */
4828 4832 args.ctag = TAG_INACTIVE;
4829 4833
4830 4834 /*
4831 4835 * Remove ops: putfh dir; remove
4832 4836 */
4833 4837 args.array_len = 2;
4834 4838 args.array = argop;
4835 4839
4836 4840 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4837 4841 if (e.error) {
4838 4842 kmem_free(unlname, MAXNAMELEN);
4839 4843 crfree(unlcred);
4840 4844 VN_RELE(unldvp);
4841 4845 /*
4842 4846 * Try again; this time around r_unldvp will be NULL, so we'll
4843 4847 * just call rp4_addfree() and return.
4844 4848 */
4845 4849 goto redo;
4846 4850 }
4847 4851
4848 4852 /* putfh directory */
4849 4853 argop[0].argop = OP_CPUTFH;
4850 4854 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4851 4855
4852 4856 /* remove */
4853 4857 argop[1].argop = OP_CREMOVE;
4854 4858 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4855 4859
4856 4860 doqueue = 1;
4857 4861 resp = &res;
4858 4862
4859 4863 #if 0 /* notyet */
4860 4864 /*
4861 4865 * Can't do this yet. We may be being called from
4862 4866 * dnlc_purge_XXX while that routine is holding a
4863 4867 * mutex lock to the nc_rele list. The calls to
4864 4868 * nfs3_cache_wcc_data may result in calls to
4865 4869 * dnlc_purge_XXX. This will result in a deadlock.
4866 4870 */
4867 4871 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4868 4872 if (e.error) {
4869 4873 PURGE_ATTRCACHE4(unldvp);
4870 4874 resp = NULL;
4871 4875 } else if (res.status) {
4872 4876 e.error = geterrno4(res.status);
4873 4877 PURGE_ATTRCACHE4(unldvp);
4874 4878 /*
4875 4879 * This code is inactive right now
4876 4880 * but if made active there should
4877 4881 * be a nfs4_end_op() call before
4878 4882 * nfs4_purge_stale_fh to avoid start_op()
4879 4883 * deadlock. See BugId: 4948726
4880 4884 */
4881 4885 nfs4_purge_stale_fh(error, unldvp, cr);
4882 4886 } else {
4883 4887 nfs_resop4 *resop;
4884 4888 REMOVE4res *rm_res;
4885 4889
4886 4890 resop = &res.array[1];
4887 4891 rm_res = &resop->nfs_resop4_u.opremove;
4888 4892 /*
4889 4893 * Update directory cache attribute,
4890 4894 * readdir and dnlc caches.
4891 4895 */
4892 4896 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4893 4897 }
4894 4898 #else
4895 4899 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4896 4900
4897 4901 PURGE_ATTRCACHE4(unldvp);
4898 4902 #endif
4899 4903
4900 4904 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4901 4905 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4902 4906 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4903 4907 if (!e.error)
4904 4908 (void) xdr_free(xdr_COMPOUND4res_clnt,
4905 4909 (caddr_t)&res);
4906 4910 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4907 4911 &recov_state, TRUE);
4908 4912 goto recov_retry_remove;
4909 4913 }
4910 4914 }
4911 4915 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4912 4916
4913 4917 /*
4914 4918 * Release stuff held for the remove
4915 4919 */
4916 4920 VN_RELE(unldvp);
4917 4921 if (!e.error && resp)
4918 4922 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4919 4923
4920 4924 kmem_free(unlname, MAXNAMELEN);
4921 4925 crfree(unlcred);
4922 4926 goto redo;
4923 4927 }
4924 4928
4925 4929 /*
4926 4930 * Remote file system operations having to do with directory manipulation.
4927 4931 */
4928 4932 /* ARGSUSED3 */
4929 4933 int
4930 4934 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4931 4935 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4932 4936 int *direntflags, pathname_t *realpnp)
4933 4937 {
4934 4938 int error;
4935 4939 vnode_t *vp, *avp = NULL;
4936 4940 rnode4_t *drp;
4937 4941
4938 4942 *vpp = NULL;
4939 4943 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4940 4944 return (EPERM);
4941 4945 /*
4942 4946 * if LOOKUP_XATTR, must replace dvp (object) with
4943 4947 * object's attrdir before continuing with lookup
4944 4948 */
4945 4949 if (flags & LOOKUP_XATTR) {
4946 4950 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4947 4951 if (error)
4948 4952 return (error);
4949 4953
4950 4954 dvp = avp;
4951 4955
4952 4956 /*
4953 4957 * If lookup is for "", just return dvp now. The attrdir
4954 4958 * has already been activated (from nfs4lookup_xattr), and
4955 4959 * the caller will RELE the original dvp -- not
4956 4960 * the attrdir. So, set vpp and return.
4957 4961 * Currently, when the LOOKUP_XATTR flag is
4958 4962 * passed to VOP_LOOKUP, the name is always empty, and
4959 4963 * shortcircuiting here avoids 3 unneeded lock/unlock
4960 4964 * pairs.
4961 4965 *
4962 4966 * If a non-empty name was provided, then it is the
4963 4967 * attribute name, and it will be looked up below.
4964 4968 */
4965 4969 if (*nm == '\0') {
4966 4970 *vpp = dvp;
4967 4971 return (0);
4968 4972 }
4969 4973
4970 4974 /*
4971 4975 * The vfs layer never sends a name when asking for the
4972 4976 * attrdir, so we should never get here (unless of course
4973 4977 * name is passed at some time in future -- at which time
4974 4978 * we'll blow up here).
4975 4979 */
4976 4980 ASSERT(0);
4977 4981 }
4978 4982
4979 4983 drp = VTOR4(dvp);
4980 4984 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4981 4985 return (EINTR);
4982 4986
4983 4987 error = nfs4lookup(dvp, nm, vpp, cr, 0);
4984 4988 nfs_rw_exit(&drp->r_rwlock);
4985 4989
4986 4990 /*
4987 4991 * If vnode is a device, create special vnode.
4988 4992 */
4989 4993 if (!error && ISVDEV((*vpp)->v_type)) {
4990 4994 vp = *vpp;
4991 4995 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
4992 4996 VN_RELE(vp);
4993 4997 }
4994 4998
4995 4999 return (error);
4996 5000 }
4997 5001
4998 5002 /* ARGSUSED */
4999 5003 static int
5000 5004 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5001 5005 {
5002 5006 int error;
5003 5007 rnode4_t *drp;
5004 5008 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5005 5009 mntinfo4_t *mi;
5006 5010
5007 5011 mi = VTOMI4(dvp);
5008 5012 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5009 5013 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5010 5014 return (EINVAL);
5011 5015
5012 5016 drp = VTOR4(dvp);
5013 5017 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5014 5018 return (EINTR);
5015 5019
5016 5020 mutex_enter(&drp->r_statelock);
5017 5021 /*
5018 5022 * If the server doesn't support xattrs just return EINVAL
5019 5023 */
5020 5024 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5021 5025 mutex_exit(&drp->r_statelock);
5022 5026 nfs_rw_exit(&drp->r_rwlock);
5023 5027 return (EINVAL);
5024 5028 }
5025 5029
5026 5030 /*
5027 5031 * If there is a cached xattr directory entry,
5028 5032 * use it as long as the attributes are valid. If the
5029 5033 * attributes are not valid, take the simple approach and
5030 5034 * free the cached value and re-fetch a new value.
5031 5035 *
5032 5036 * We don't negative entry cache for now, if we did we
5033 5037 * would need to check if the file has changed on every
5034 5038 * lookup. But xattrs don't exist very often and failing
5035 5039 * an openattr is not much more expensive than and NVERIFY or GETATTR
5036 5040 * so do an openattr over the wire for now.
5037 5041 */
5038 5042 if (drp->r_xattr_dir != NULL) {
5039 5043 if (ATTRCACHE4_VALID(dvp)) {
5040 5044 VN_HOLD(drp->r_xattr_dir);
5041 5045 *vpp = drp->r_xattr_dir;
5042 5046 mutex_exit(&drp->r_statelock);
5043 5047 nfs_rw_exit(&drp->r_rwlock);
5044 5048 return (0);
5045 5049 }
5046 5050 VN_RELE(drp->r_xattr_dir);
5047 5051 drp->r_xattr_dir = NULL;
5048 5052 }
5049 5053 mutex_exit(&drp->r_statelock);
5050 5054
5051 5055 error = nfs4openattr(dvp, vpp, cflag, cr);
5052 5056
5053 5057 nfs_rw_exit(&drp->r_rwlock);
5054 5058
5055 5059 return (error);
5056 5060 }
5057 5061
/*
 * Common lookup worker for nfs4_lookup(): resolve name nm in directory dvp,
 * returning a held vnode in *vpp. Handles the trivial "" and "." cases
 * locally, then consults the DNLC (unless skipdnlc is set) and falls back
 * to an over-the-wire lookup or revalidation as needed.
 *
 * Caller must hold drp->r_rwlock (as reader) and be in the mount's zone.
 * Returns 0 with *vpp held on success, or an errno with *vpp == NULL.
 */
static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp. Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp. Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/* Mark that a lookup has been done in this directory. */
	drp = VTOR4(dvp);
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC. If there is no entry
	 * lookup over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
		 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			/* Wait for any in-progress cache purge to finish. */
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * If after the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				if (*vpp == DNLC_NO_VNODE) {
					/* Valid cached negative entry. */
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * We may have gotten here we have one of the following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 *    need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 *    must validate.
	 *
	 * Go to the server and check if the directory has changed, if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}
5183 5187
5184 5188 /*
5185 5189 * Go to the server and check if the directory has changed, if
5186 5190 * it hasn't we are done and can use the dnlc entry. If it
5187 5191 * has changed we get a new copy of its attributes and check
5188 5192 * the access for VEXEC, then relookup the filename and
5189 5193 * get its filehandle and attributes.
5190 5194 *
5191 5195 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5192 5196 * if the NVERIFY failed we must
5193 5197 * purge the caches
5194 5198 * cache new attributes (will set r_time_attr_inval)
5195 5199 * cache new access
5196 5200 * recheck VEXEC access
5197 5201 * add name to dnlc, possibly negative
5198 5202 * if LOOKUP succeeded
5199 5203 * cache new attributes
5200 5204 * else
5201 5205 * set a new r_time_attr_inval for dvp
5202 5206 * check to make sure we have access
5203 5207 *
5204 5208 * The vpp returned is the vnode passed in if the directory is valid,
5205 5209 * a new vnode if successful lookup, or NULL on error.
5206 5210 */
5207 5211 static int
5208 5212 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5209 5213 {
5210 5214 COMPOUND4args_clnt args;
5211 5215 COMPOUND4res_clnt res;
5212 5216 fattr4 *ver_fattr;
5213 5217 fattr4_change dchange;
5214 5218 int32_t *ptr;
5215 5219 int argoplist_size = 7 * sizeof (nfs_argop4);
5216 5220 nfs_argop4 *argop;
5217 5221 int doqueue;
5218 5222 mntinfo4_t *mi;
5219 5223 nfs4_recov_state_t recov_state;
5220 5224 hrtime_t t;
5221 5225 int isdotdot;
5222 5226 vnode_t *nvp;
5223 5227 nfs_fh4 *fhp;
5224 5228 nfs4_sharedfh_t *sfhp;
5225 5229 nfs4_access_type_t cacc;
5226 5230 rnode4_t *nrp;
5227 5231 rnode4_t *drp = VTOR4(dvp);
5228 5232 nfs4_ga_res_t *garp = NULL;
5229 5233 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5230 5234
5231 5235 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5232 5236 ASSERT(nm != NULL);
5233 5237 ASSERT(nm[0] != '\0');
5234 5238 ASSERT(dvp->v_type == VDIR);
5235 5239 ASSERT(nm[0] != '.' || nm[1] != '\0');
5236 5240 ASSERT(*vpp != NULL);
5237 5241
5238 5242 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5239 5243 isdotdot = 1;
5240 5244 args.ctag = TAG_LOOKUP_VPARENT;
5241 5245 } else {
5242 5246 /*
5243 5247 * If dvp were a stub, it should have triggered and caused
5244 5248 * a mount for us to get this far.
5245 5249 */
5246 5250 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5247 5251
5248 5252 isdotdot = 0;
5249 5253 args.ctag = TAG_LOOKUP_VALID;
5250 5254 }
5251 5255
5252 5256 mi = VTOMI4(dvp);
5253 5257 recov_state.rs_flags = 0;
5254 5258 recov_state.rs_num_retry_despite_err = 0;
5255 5259
5256 5260 nvp = NULL;
5257 5261
5258 5262 /* Save the original mount point security information */
5259 5263 (void) save_mnt_secinfo(mi->mi_curr_serv);
5260 5264
5261 5265 recov_retry:
5262 5266 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5263 5267 &recov_state, NULL);
5264 5268 if (e.error) {
5265 5269 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5266 5270 VN_RELE(*vpp);
5267 5271 *vpp = NULL;
5268 5272 return (e.error);
5269 5273 }
5270 5274
5271 5275 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5272 5276
5273 5277 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5274 5278 args.array_len = 7;
5275 5279 args.array = argop;
5276 5280
5277 5281 /* 0. putfh file */
5278 5282 argop[0].argop = OP_CPUTFH;
5279 5283 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5280 5284
5281 5285 /* 1. nverify the change info */
5282 5286 argop[1].argop = OP_NVERIFY;
5283 5287 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5284 5288 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5285 5289 ver_fattr->attrlist4 = (char *)&dchange;
5286 5290 ptr = (int32_t *)&dchange;
5287 5291 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5288 5292 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5289 5293
5290 5294 /* 2. getattr directory */
5291 5295 argop[2].argop = OP_GETATTR;
5292 5296 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5293 5297 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5294 5298
5295 5299 /* 3. access directory */
5296 5300 argop[3].argop = OP_ACCESS;
5297 5301 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5298 5302 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5299 5303
5300 5304 /* 4. lookup name */
5301 5305 if (isdotdot) {
5302 5306 argop[4].argop = OP_LOOKUPP;
5303 5307 } else {
5304 5308 argop[4].argop = OP_CLOOKUP;
5305 5309 argop[4].nfs_argop4_u.opclookup.cname = nm;
5306 5310 }
5307 5311
5308 5312 /* 5. resulting file handle */
5309 5313 argop[5].argop = OP_GETFH;
5310 5314
5311 5315 /* 6. resulting file attributes */
5312 5316 argop[6].argop = OP_GETATTR;
5313 5317 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5314 5318 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5315 5319
5316 5320 doqueue = 1;
5317 5321 t = gethrtime();
5318 5322
5319 5323 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5320 5324
5321 5325 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5322 5326 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5323 5327 if (e.error != 0 && *vpp != NULL)
5324 5328 VN_RELE(*vpp);
5325 5329 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5326 5330 &recov_state, FALSE);
5327 5331 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5328 5332 kmem_free(argop, argoplist_size);
5329 5333 return (e.error);
5330 5334 }
5331 5335
5332 5336 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5333 5337 /*
5334 5338 * For WRONGSEC of a non-dotdot case, send secinfo directly
5335 5339 * from this thread, do not go thru the recovery thread since
5336 5340 * we need the nm information.
5337 5341 *
5338 5342 * Not doing dotdot case because there is no specification
5339 5343 * for (PUTFH, SECINFO "..") yet.
5340 5344 */
5341 5345 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5342 5346 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5343 5347 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5344 5348 &recov_state, FALSE);
5345 5349 else
5346 5350 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5347 5351 &recov_state, TRUE);
5348 5352 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5349 5353 kmem_free(argop, argoplist_size);
5350 5354 if (!e.error)
5351 5355 goto recov_retry;
5352 5356 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5353 5357 VN_RELE(*vpp);
5354 5358 *vpp = NULL;
5355 5359 return (e.error);
5356 5360 }
5357 5361
5358 5362 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5359 5363 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5360 5364 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5361 5365 &recov_state, TRUE);
5362 5366
5363 5367 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5364 5368 kmem_free(argop, argoplist_size);
5365 5369 goto recov_retry;
5366 5370 }
5367 5371 }
5368 5372
5369 5373 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5370 5374
5371 5375 if (e.error || res.array_len == 0) {
5372 5376 /*
5373 5377 * If e.error isn't set, then reply has no ops (or we couldn't
5374 5378 * be here). The only legal way to reply without an op array
5375 5379 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5376 5380 * be in the reply for all other status values.
5377 5381 *
5378 5382 * For valid replies without an ops array, return ENOTSUP
5379 5383 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5380 5384 * return EIO -- don't trust status.
5381 5385 */
5382 5386 if (e.error == 0)
5383 5387 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5384 5388 ENOTSUP : EIO;
5385 5389 VN_RELE(*vpp);
5386 5390 *vpp = NULL;
5387 5391 kmem_free(argop, argoplist_size);
5388 5392 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5389 5393 return (e.error);
5390 5394 }
5391 5395
5392 5396 if (res.status != NFS4ERR_SAME) {
5393 5397 e.error = geterrno4(res.status);
5394 5398
5395 5399 /*
5396 5400 * The NVERIFY "failed" so the directory has changed
5397 5401 * First make sure PUTFH succeeded and NVERIFY "failed"
5398 5402 * cleanly.
5399 5403 */
5400 5404 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5401 5405 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5402 5406 nfs4_purge_stale_fh(e.error, dvp, cr);
5403 5407 VN_RELE(*vpp);
5404 5408 *vpp = NULL;
5405 5409 goto exit;
5406 5410 }
5407 5411
5408 5412 /*
5409 5413 * We know the NVERIFY "failed" so we must:
5410 5414 * purge the caches (access and indirectly dnlc if needed)
5411 5415 */
5412 5416 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5413 5417
5414 5418 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5415 5419 nfs4_purge_stale_fh(e.error, dvp, cr);
5416 5420 VN_RELE(*vpp);
5417 5421 *vpp = NULL;
5418 5422 goto exit;
5419 5423 }
5420 5424
5421 5425 /*
5422 5426 * Install new cached attributes for the directory
5423 5427 */
5424 5428 nfs4_attr_cache(dvp,
5425 5429 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5426 5430 t, cr, FALSE, NULL);
5427 5431
5428 5432 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5429 5433 nfs4_purge_stale_fh(e.error, dvp, cr);
5430 5434 VN_RELE(*vpp);
5431 5435 *vpp = NULL;
5432 5436 e.error = geterrno4(res.status);
5433 5437 goto exit;
5434 5438 }
5435 5439
5436 5440 /*
5437 5441 * Now we know the directory is valid,
5438 5442 * cache new directory access
5439 5443 */
5440 5444 nfs4_access_cache(drp,
5441 5445 args.array[3].nfs_argop4_u.opaccess.access,
5442 5446 res.array[3].nfs_resop4_u.opaccess.access, cr);
5443 5447
5444 5448 /*
5445 5449 * recheck VEXEC access
5446 5450 */
5447 5451 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5448 5452 if (cacc != NFS4_ACCESS_ALLOWED) {
5449 5453 /*
5450 5454 * Directory permissions might have been revoked
5451 5455 */
5452 5456 if (cacc == NFS4_ACCESS_DENIED) {
5453 5457 e.error = EACCES;
5454 5458 VN_RELE(*vpp);
5455 5459 *vpp = NULL;
5456 5460 goto exit;
5457 5461 }
5458 5462
5459 5463 /*
5460 5464 * Somehow we must not have asked for enough
5461 5465 * so try a singleton ACCESS, should never happen.
5462 5466 */
5463 5467 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5464 5468 if (e.error) {
5465 5469 VN_RELE(*vpp);
5466 5470 *vpp = NULL;
5467 5471 goto exit;
5468 5472 }
5469 5473 }
5470 5474
5471 5475 e.error = geterrno4(res.status);
5472 5476 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5473 5477 /*
5474 5478 * The lookup failed, probably no entry
5475 5479 */
5476 5480 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5477 5481 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5478 5482 } else {
5479 5483 /*
5480 5484 * Might be some other error, so remove
5481 5485 * the dnlc entry to make sure we start all
5482 5486 * over again, next time.
5483 5487 */
5484 5488 dnlc_remove(dvp, nm);
5485 5489 }
5486 5490 VN_RELE(*vpp);
5487 5491 *vpp = NULL;
5488 5492 goto exit;
5489 5493 }
5490 5494
5491 5495 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5492 5496 /*
5493 5497 * The file exists but we can't get its fh for
5494 5498 * some unknown reason. Remove it from the dnlc
5495 5499 * and error out to be safe.
5496 5500 */
5497 5501 dnlc_remove(dvp, nm);
5498 5502 VN_RELE(*vpp);
5499 5503 *vpp = NULL;
5500 5504 goto exit;
5501 5505 }
5502 5506 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5503 5507 if (fhp->nfs_fh4_len == 0) {
5504 5508 /*
5505 5509 * The file exists but a bogus fh
5506 5510 * some unknown reason. Remove it from the dnlc
5507 5511 * and error out to be safe.
5508 5512 */
5509 5513 e.error = ENOENT;
5510 5514 dnlc_remove(dvp, nm);
5511 5515 VN_RELE(*vpp);
5512 5516 *vpp = NULL;
5513 5517 goto exit;
5514 5518 }
5515 5519 sfhp = sfh4_get(fhp, mi);
5516 5520
5517 5521 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5518 5522 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5519 5523
5520 5524 /*
5521 5525 * Make the new rnode
5522 5526 */
5523 5527 if (isdotdot) {
5524 5528 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5525 5529 if (e.error) {
5526 5530 sfh4_rele(&sfhp);
5527 5531 VN_RELE(*vpp);
5528 5532 *vpp = NULL;
5529 5533 goto exit;
5530 5534 }
5531 5535 /*
5532 5536 * XXX if nfs4_make_dotdot uses an existing rnode
5533 5537 * XXX it doesn't update the attributes.
5534 5538 * XXX for now just save them again to save an OTW
5535 5539 */
5536 5540 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5537 5541 } else {
5538 5542 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5539 5543 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5540 5544 /*
5541 5545 * If v_type == VNON, then garp was NULL because
5542 5546 * the last op in the compound failed and makenfs4node
5543 5547 * could not find the vnode for sfhp. It created
5544 5548 * a new vnode, so we have nothing to purge here.
5545 5549 */
5546 5550 if (nvp->v_type == VNON) {
5547 5551 vattr_t vattr;
5548 5552
5549 5553 vattr.va_mask = AT_TYPE;
5550 5554 /*
5551 5555 * N.B. We've already called nfs4_end_fop above.
5552 5556 */
5553 5557 e.error = nfs4getattr(nvp, &vattr, cr);
5554 5558 if (e.error) {
5555 5559 sfh4_rele(&sfhp);
5556 5560 VN_RELE(*vpp);
5557 5561 *vpp = NULL;
5558 5562 VN_RELE(nvp);
5559 5563 goto exit;
5560 5564 }
5561 5565 nvp->v_type = vattr.va_type;
5562 5566 }
5563 5567 }
5564 5568 sfh4_rele(&sfhp);
5565 5569
5566 5570 nrp = VTOR4(nvp);
5567 5571 mutex_enter(&nrp->r_statev4_lock);
5568 5572 if (!nrp->created_v4) {
5569 5573 mutex_exit(&nrp->r_statev4_lock);
5570 5574 dnlc_update(dvp, nm, nvp);
5571 5575 } else
5572 5576 mutex_exit(&nrp->r_statev4_lock);
5573 5577
5574 5578 VN_RELE(*vpp);
5575 5579 *vpp = nvp;
5576 5580 } else {
5577 5581 hrtime_t now;
5578 5582 hrtime_t delta = 0;
5579 5583
5580 5584 e.error = 0;
5581 5585
5582 5586 /*
5583 5587 * Because the NVERIFY "succeeded" we know that the
5584 5588 * directory attributes are still valid
5585 5589 * so update r_time_attr_inval
5586 5590 */
5587 5591 now = gethrtime();
5588 5592 mutex_enter(&drp->r_statelock);
5589 5593 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5590 5594 delta = now - drp->r_time_attr_saved;
5591 5595 if (delta < mi->mi_acdirmin)
5592 5596 delta = mi->mi_acdirmin;
5593 5597 else if (delta > mi->mi_acdirmax)
5594 5598 delta = mi->mi_acdirmax;
5595 5599 }
5596 5600 drp->r_time_attr_inval = now + delta;
5597 5601 mutex_exit(&drp->r_statelock);
5598 5602 dnlc_update(dvp, nm, *vpp);
5599 5603
5600 5604 /*
5601 5605 * Even though we have a valid directory attr cache
5602 5606 * and dnlc entry, we may not have access.
5603 5607 * This should almost always hit the cache.
5604 5608 */
5605 5609 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5606 5610 if (e.error) {
5607 5611 VN_RELE(*vpp);
5608 5612 *vpp = NULL;
5609 5613 }
5610 5614
5611 5615 if (*vpp == DNLC_NO_VNODE) {
5612 5616 VN_RELE(*vpp);
5613 5617 *vpp = NULL;
5614 5618 e.error = ENOENT;
5615 5619 }
5616 5620 }
5617 5621
5618 5622 exit:
5619 5623 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5620 5624 kmem_free(argop, argoplist_size);
5621 5625 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5622 5626 return (e.error);
5623 5627 }
5624 5628
/*
 * We need to go over the wire to lookup the name, but
 * while we are there verify the directory has not
 * changed but if it has, get new attributes and check access
 *
 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
 *	NVERIFY GETATTR ACCESS
 *
 * With the results:
 *	if the NVERIFY failed we must purge the caches, add new attributes,
 *	and cache new access.
 *	set a new r_time_attr_inval
 *	add name to dnlc, possibly negative
 *	if LOOKUP succeeded
 *	cache new attributes
 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size  = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	/* "." lookups are handled by the caller, never sent over the wire */
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/*
	 * 6. nverify the change info: succeeds (NFS4ERR_SAME) only if the
	 * directory's change attribute still matches our cached r_change.
	 */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * A migrated filesystem: chase the referral instead of looking up
	 * locally.  Not done for ".." since a referral cannot be the parent.
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* On secinfo success, retry with the new flavor */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here). The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry.  Cache the
		 * negative result if negative caching is enabled.
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason. Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but the server returned a bogus
		 * (zero-length) fh for some unknown reason.
		 * Error out to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	sfhp = sfh4_get(fhp, mi);

	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		/*
		 * Install the fresh directory attributes returned by op 7.
		 */
		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough,
			 * so try a singleton ACCESS; should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			/*
			 * Scale the attr cache lifetime by the attribute
			 * age, clamped to [acdirmin, acdirmax].
			 */
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * Don't add an entry to the dnlc for a file created over this
	 * mount (created_v4); the OPEN path owns that entry.
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
6029 6033
#ifdef DEBUG
/*
 * Debug aid: log a one-line console note for each op in a lookup
 * compound request, tagged with the caller-supplied "where" string.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	uint_t idx, slen;
	zoneid_t zoneid = getzoneid();
	char *name;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (idx = 0; idx < argcnt; idx++) {
		nfs_argop4 *argp = &argbase[idx];

		switch (argp->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", idx);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", idx);
			break;
		case OP_CLOOKUP:
			/* cname is caller-owned; nothing to free */
			name = argp->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s",
			    idx, name);
			break;
		case OP_LOOKUP:
			/* utf8_to_str allocates; free after logging */
			name = utf8_to_str(&argp->nfs_argop4_u.oplookup.objname,
			    &slen, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s",
			    idx, name);
			kmem_free(name, slen);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", idx);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", idx);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", idx);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", idx);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", idx,
			    argp->argop);
			break;
		}
	}
}
#endif
6079 6083
6080 6084 /*
6081 6085 * nfs4lookup_setup - constructs a multi-lookup compound request.
6082 6086 *
6083 6087 * Given the path "nm1/nm2/.../nmn", the following compound requests
6084 6088 * may be created:
6085 6089 *
 * Note: Getfh is not needed because the filehandle attr is mandatory, but it
 * is faster, for now.
6088 6092 *
6089 6093 * l4_getattrs indicates the type of compound requested.
6090 6094 *
6091 6095 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6092 6096 *
6093 6097 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6094 6098 *
6095 6099 * total number of ops is n + 1.
6096 6100 *
6097 6101 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6098 6102 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6099 6103 * before the last component, and only get attributes
6100 6104 * for the last component. Note that the second-to-last
6101 6105 * pathname component is XATTR_RPATH, which does NOT go
6102 6106 * over-the-wire as a lookup.
6103 6107 *
6104 6108 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6105 6109 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6106 6110 *
6107 6111 * and total number of ops is n + 5.
6108 6112 *
6109 6113 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6110 6114 * attribute directory: create lookups plus an OPENATTR
6111 6115 * replacing the last lookup. Note that the last pathname
6112 6116 * component is XATTR_RPATH, which does NOT go over-the-wire
6113 6117 * as a lookup.
6114 6118 *
6115 6119 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6116 6120 * Openattr; Getfh; Getattr }
6117 6121 *
6118 6122 * and total number of ops is n + 5.
6119 6123 *
6120 6124 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6121 6125 * nodes too.
6122 6126 *
6123 6127 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6124 6128 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6125 6129 *
6126 6130 * and total number of ops is 3*n + 1.
6127 6131 *
6128 6132 * All cases: returns the index in the arg array of the final LOOKUP op, or
6129 6133 * -1 if no LOOKUPs were used.
6130 6134 */
int
nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
{
	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
	nfs_argop4 *argbase, *argop;
	int arglen, argcnt;
	int n = 1;	/* number of components */
	int nga = 1;	/* number of Getattr's in request */
	char c = '\0', *s, *p;
	int lookup_idx = -1;
	int argoplist_size;

	/* set lookuparg response result to 0 */
	lookupargp->resp->status = NFS4_OK;

	/* skip leading "/" or "." e.g. ".//./" if there is */
	for (; ; nm++) {
		if (*nm != '/' && *nm != '.')
			break;

		/* ".." is counted as 1 component */
		if (*nm == '.' && *(nm + 1) != '/')
			break;
	}

	/*
	 * Find n = number of components - nm must be null terminated
	 * Skip "." components.
	 */
	if (*nm != '\0')
		for (n = 1, s = nm; *s != '\0'; s++) {
			if ((*s == '/') && (*(s + 1) != '/') &&
			    (*(s + 1) != '\0') &&
			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
			    *(s + 2) == '\0')))
				n++;
		}
	else
		n = 0;

	/*
	 * nga is number of components that need Getfh+Getattr
	 */
	switch (l4_getattrs) {
	case LKP4_NO_ATTRIBUTES:
		nga = 0;
		break;
	case LKP4_ALL_ATTRIBUTES:
		nga = n;
		/*
		 * Always have at least 1 getfh, getattr pair
		 */
		if (nga == 0)
			nga++;
		break;
	case LKP4_LAST_ATTRDIR:
	case LKP4_LAST_NAMED_ATTR:
		nga = n+1;
		break;
	}

	/*
	 * If change to use the filehandle attr instead of getfh
	 * the following line can be deleted.
	 * (each attributed component costs a Getfh AND a Getattr op)
	 */
	nga *= 2;

	/*
	 * calculate number of ops in request as
	 * header + trailer + lookups + getattrs
	 */
	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;

	argoplist_size = arglen * sizeof (nfs_argop4);
	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
	lookupargp->argsp->array = argop;

	/* leave room for the caller's header ops; argcnt tracks ops built */
	argcnt = lookupargp->header_len;
	argop += argcnt;

	/*
	 * loop and create a lookup op and possibly getattr/getfh for
	 * each component. Skip "." components.
	 */
	for (s = nm; *s != '\0'; s = p) {
		/*
		 * Set up a pathname struct for each component if needed
		 */
		while (*s == '/')
			s++;
		if (*s == '\0')
			break;

		for (p = s; (*p != '/') && (*p != '\0'); p++)
			;
		/*
		 * Temporarily NUL-terminate the component in place; the
		 * saved character c is restored before the next iteration.
		 */
		c = *p;
		*p = '\0';

		if (s[0] == '.' && s[1] == '\0') {
			*p = c;
			continue;
		}
		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;

			/* openattr */
			argop->argop = OP_OPENATTR;
		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* openattr */
			argop->argop = OP_OPENATTR;
			argop++;
			argcnt++;

			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
			*p = c;
			continue;
		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
			/* lookupp */
			argop->argop = OP_LOOKUPP;
		} else {
			/* lookup */
			argop->argop = OP_LOOKUP;
			(void) str_to_utf8(s,
			    &argop->nfs_argop4_u.oplookup.objname);
		}
		/* remember the index of the most recent lookup-class op */
		lookup_idx = argcnt;
		argop++;
		argcnt++;

		*p = c;

		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
		}
	}

	/*
	 * Attributes were requested, but the loop above did not emit the
	 * final getfh/getattr pair (either not LKP4_ALL_ATTRIBUTES, or no
	 * lookup op was generated at all) -- append it here.
	 */
	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
		if (needgetfh) {
			/* stick in a post-lookup getfh */
			argop->argop = OP_GETFH;
			argcnt++;
			argop++;
		}
		/* post-lookup getattr */
		argop->argop = OP_GETATTR;
		argop->nfs_argop4_u.opgetattr.attr_request =
		    lookupargp->ga_bits;
		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
		argcnt++;
	}
	argcnt += lookupargp->trailer_len;	/* actual op count */
	lookupargp->argsp->array_len = argcnt;
	lookupargp->arglen = arglen;

#ifdef DEBUG
	if (nfs4_client_lookup_debug)
		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
#endif

	return (lookup_idx);
}
6331 6335
6332 6336 static int
6333 6337 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6334 6338 {
6335 6339 COMPOUND4args_clnt args;
6336 6340 COMPOUND4res_clnt res;
6337 6341 GETFH4res *gf_res = NULL;
6338 6342 nfs_argop4 argop[4];
6339 6343 nfs_resop4 *resop = NULL;
6340 6344 nfs4_sharedfh_t *sfhp;
6341 6345 hrtime_t t;
6342 6346 nfs4_error_t e;
6343 6347
6344 6348 rnode4_t *drp;
6345 6349 int doqueue = 1;
6346 6350 vnode_t *vp;
6347 6351 int needrecov = 0;
6348 6352 nfs4_recov_state_t recov_state;
6349 6353
6350 6354 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6351 6355
6352 6356 *avp = NULL;
6353 6357 recov_state.rs_flags = 0;
6354 6358 recov_state.rs_num_retry_despite_err = 0;
6355 6359
6356 6360 recov_retry:
6357 6361 /* COMPOUND: putfh, openattr, getfh, getattr */
6358 6362 args.array_len = 4;
6359 6363 args.array = argop;
6360 6364 args.ctag = TAG_OPENATTR;
6361 6365
6362 6366 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6363 6367 if (e.error)
6364 6368 return (e.error);
6365 6369
6366 6370 drp = VTOR4(dvp);
6367 6371
6368 6372 /* putfh */
6369 6373 argop[0].argop = OP_CPUTFH;
6370 6374 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6371 6375
6372 6376 /* openattr */
6373 6377 argop[1].argop = OP_OPENATTR;
6374 6378 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6375 6379
6376 6380 /* getfh */
6377 6381 argop[2].argop = OP_GETFH;
6378 6382
6379 6383 /* getattr */
6380 6384 argop[3].argop = OP_GETATTR;
6381 6385 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6382 6386 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6383 6387
6384 6388 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6385 6389 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6386 6390 rnode4info(drp)));
6387 6391
6388 6392 t = gethrtime();
6389 6393
6390 6394 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6391 6395
6392 6396 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6393 6397 if (needrecov) {
6394 6398 bool_t abort;
6395 6399
6396 6400 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6397 6401 "nfs4openattr: initiating recovery\n"));
6398 6402
6399 6403 abort = nfs4_start_recovery(&e,
6400 6404 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6401 6405 OP_OPENATTR, NULL, NULL, NULL);
6402 6406 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6403 6407 if (!e.error) {
6404 6408 e.error = geterrno4(res.status);
6405 6409 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6406 6410 }
6407 6411 if (abort == FALSE)
6408 6412 goto recov_retry;
6409 6413 return (e.error);
6410 6414 }
6411 6415
6412 6416 if (e.error) {
6413 6417 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6414 6418 return (e.error);
6415 6419 }
6416 6420
6417 6421 if (res.status) {
6418 6422 /*
6419 6423 * If OTW errro is NOTSUPP, then it should be
6420 6424 * translated to EINVAL. All Solaris file system
6421 6425 * implementations return EINVAL to the syscall layer
6422 6426 * when the attrdir cannot be created due to an
6423 6427 * implementation restriction or noxattr mount option.
6424 6428 */
6425 6429 if (res.status == NFS4ERR_NOTSUPP) {
6426 6430 mutex_enter(&drp->r_statelock);
6427 6431 if (drp->r_xattr_dir)
6428 6432 VN_RELE(drp->r_xattr_dir);
6429 6433 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6430 6434 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6431 6435 mutex_exit(&drp->r_statelock);
6432 6436
6433 6437 e.error = EINVAL;
6434 6438 } else {
6435 6439 e.error = geterrno4(res.status);
6436 6440 }
6437 6441
6438 6442 if (e.error) {
6439 6443 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6440 6444 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6441 6445 needrecov);
6442 6446 return (e.error);
6443 6447 }
6444 6448 }
6445 6449
6446 6450 resop = &res.array[0]; /* putfh res */
6447 6451 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6448 6452
6449 6453 resop = &res.array[1]; /* openattr res */
6450 6454 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6451 6455
6452 6456 resop = &res.array[2]; /* getfh res */
6453 6457 gf_res = &resop->nfs_resop4_u.opgetfh;
6454 6458 if (gf_res->object.nfs_fh4_len == 0) {
6455 6459 *avp = NULL;
6456 6460 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6457 6461 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6458 6462 return (ENOENT);
6459 6463 }
6460 6464
6461 6465 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6462 6466 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6463 6467 dvp->v_vfsp, t, cr, dvp,
6464 6468 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6465 6469 sfh4_rele(&sfhp);
6466 6470
6467 6471 if (e.error)
6468 6472 PURGE_ATTRCACHE4(vp);
6469 6473
6470 6474 mutex_enter(&vp->v_lock);
6471 6475 vp->v_flag |= V_XATTRDIR;
6472 6476 mutex_exit(&vp->v_lock);
6473 6477
6474 6478 *avp = vp;
6475 6479
6476 6480 mutex_enter(&drp->r_statelock);
6477 6481 if (drp->r_xattr_dir)
6478 6482 VN_RELE(drp->r_xattr_dir);
6479 6483 VN_HOLD(vp);
6480 6484 drp->r_xattr_dir = vp;
6481 6485
6482 6486 /*
6483 6487 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6484 6488 * NULL. xattrs could be created at any time, and we have no
6485 6489 * way to update pc4_xattr_exists in the base object if/when
6486 6490 * it happens.
6487 6491 */
6488 6492 drp->r_pathconf.pc4_xattr_valid = 0;
6489 6493
6490 6494 mutex_exit(&drp->r_statelock);
6491 6495
6492 6496 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6493 6497
6494 6498 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6495 6499
6496 6500 return (0);
6497 6501 }
6498 6502
6499 6503 /* ARGSUSED */
6500 6504 static int
6501 6505 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6502 6506 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
6503 6507 vsecattr_t *vsecp)
6504 6508 {
6505 6509 int error;
6506 6510 vnode_t *vp = NULL;
6507 6511 rnode4_t *rp;
6508 6512 struct vattr vattr;
6509 6513 rnode4_t *drp;
6510 6514 vnode_t *tempvp;
6511 6515 enum createmode4 createmode;
6512 6516 bool_t must_trunc = FALSE;
6513 6517 int truncating = 0;
6514 6518
6515 6519 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6516 6520 return (EPERM);
6517 6521 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6518 6522 return (EINVAL);
6519 6523 }
6520 6524
6521 6525 /* . and .. have special meaning in the protocol, reject them. */
6522 6526
6523 6527 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6524 6528 return (EISDIR);
6525 6529
6526 6530 drp = VTOR4(dvp);
6527 6531
6528 6532 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6529 6533 return (EINTR);
6530 6534
6531 6535 top:
6532 6536 /*
6533 6537 * We make a copy of the attributes because the caller does not
6534 6538 * expect us to change what va points to.
6535 6539 */
6536 6540 vattr = *va;
6537 6541
6538 6542 /*
6539 6543 * If the pathname is "", then dvp is the root vnode of
6540 6544 * a remote file mounted over a local directory.
6541 6545 * All that needs to be done is access
6542 6546 * checking and truncation. Note that we avoid doing
6543 6547 * open w/ create because the parent directory might
6544 6548 * be in pseudo-fs and the open would fail.
6545 6549 */
6546 6550 if (*nm == '\0') {
6547 6551 error = 0;
6548 6552 VN_HOLD(dvp);
6549 6553 vp = dvp;
6550 6554 must_trunc = TRUE;
6551 6555 } else {
6552 6556 /*
6553 6557 * We need to go over the wire, just to be sure whether the
6554 6558 * file exists or not. Using the DNLC can be dangerous in
6555 6559 * this case when making a decision regarding existence.
6556 6560 */
6557 6561 error = nfs4lookup(dvp, nm, &vp, cr, 1);
6558 6562 }
6559 6563
6560 6564 if (exclusive)
6561 6565 createmode = EXCLUSIVE4;
6562 6566 else
6563 6567 createmode = GUARDED4;
6564 6568
6565 6569 /*
6566 6570 * error would be set if the file does not exist on the
6567 6571 * server, so lets go create it.
6568 6572 */
6569 6573 if (error) {
6570 6574 goto create_otw;
6571 6575 }
6572 6576
6573 6577 /*
6574 6578 * File does exist on the server
6575 6579 */
6576 6580 if (exclusive == EXCL)
6577 6581 error = EEXIST;
6578 6582 else if (vp->v_type == VDIR && (mode & VWRITE))
6579 6583 error = EISDIR;
6580 6584 else {
6581 6585 /*
6582 6586 * If vnode is a device, create special vnode.
6583 6587 */
6584 6588 if (ISVDEV(vp->v_type)) {
6585 6589 tempvp = vp;
6586 6590 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6587 6591 VN_RELE(tempvp);
6588 6592 }
6589 6593 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
6590 6594 if ((vattr.va_mask & AT_SIZE) &&
6591 6595 vp->v_type == VREG) {
6592 6596 rp = VTOR4(vp);
6593 6597 /*
6594 6598 * Check here for large file handled
6595 6599 * by LF-unaware process (as
6596 6600 * ufs_create() does)
6597 6601 */
6598 6602 if (!(flags & FOFFMAX)) {
6599 6603 mutex_enter(&rp->r_statelock);
6600 6604 if (rp->r_size > MAXOFF32_T)
6601 6605 error = EOVERFLOW;
6602 6606 mutex_exit(&rp->r_statelock);
6603 6607 }
6604 6608
6605 6609 /* if error is set then we need to return */
6606 6610 if (error) {
6607 6611 nfs_rw_exit(&drp->r_rwlock);
6608 6612 VN_RELE(vp);
6609 6613 return (error);
6610 6614 }
6611 6615
6612 6616 if (must_trunc) {
6613 6617 vattr.va_mask = AT_SIZE;
6614 6618 error = nfs4setattr(vp, &vattr, 0, cr,
6615 6619 NULL);
6616 6620 } else {
6617 6621 /*
6618 6622 * we know we have a regular file that already
6619 6623 * exists and we may end up truncating the file
6620 6624 * as a result of the open_otw, so flush out
6621 6625 * any dirty pages for this file first.
6622 6626 */
6623 6627 if (nfs4_has_pages(vp) &&
6624 6628 ((rp->r_flags & R4DIRTY) ||
6625 6629 rp->r_count > 0 ||
6626 6630 rp->r_mapcnt > 0)) {
6627 6631 error = nfs4_putpage(vp,
6628 6632 (offset_t)0, 0, 0, cr, ct);
6629 6633 if (error && (error == ENOSPC ||
6630 6634 error == EDQUOT)) {
6631 6635 mutex_enter(
6632 6636 &rp->r_statelock);
6633 6637 if (!rp->r_error)
6634 6638 rp->r_error =
6635 6639 error;
6636 6640 mutex_exit(
6637 6641 &rp->r_statelock);
6638 6642 }
6639 6643 }
6640 6644 vattr.va_mask = (AT_SIZE |
6641 6645 AT_TYPE | AT_MODE);
6642 6646 vattr.va_type = VREG;
6643 6647 createmode = UNCHECKED4;
6644 6648 truncating = 1;
6645 6649 goto create_otw;
↓ open down ↓ |
6602 lines elided |
↑ open up ↑ |
6646 6650 }
6647 6651 }
6648 6652 }
6649 6653 }
6650 6654 nfs_rw_exit(&drp->r_rwlock);
6651 6655 if (error) {
6652 6656 VN_RELE(vp);
6653 6657 } else {
6654 6658 vnode_t *tvp;
6655 6659 rnode4_t *trp;
6656 - /*
6657 - * existing file got truncated, notify.
6658 - */
6659 6660 tvp = vp;
6660 6661 if (vp->v_type == VREG) {
6661 6662 trp = VTOR4(vp);
6662 6663 if (IS_SHADOW(vp, trp))
6663 6664 tvp = RTOV4(trp);
6664 6665 }
6665 - vnevent_create(tvp, ct);
6666 +
6667 + if (must_trunc) {
6668 + /*
6669 + * existing file got truncated, notify.
6670 + */
6671 + vnevent_create(tvp, ct);
6672 + }
6673 +
6666 6674 *vpp = vp;
6667 6675 }
6668 6676 return (error);
6669 6677
6670 6678 create_otw:
6671 6679 dnlc_remove(dvp, nm);
6672 6680
6673 6681 ASSERT(vattr.va_mask & AT_TYPE);
6674 6682
6675 6683 /*
6676 6684 * If not a regular file let nfs4mknod() handle it.
6677 6685 */
6678 6686 if (vattr.va_type != VREG) {
6679 6687 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6680 6688 nfs_rw_exit(&drp->r_rwlock);
6681 6689 return (error);
6682 6690 }
6683 6691
6684 6692 /*
6685 6693 * It _is_ a regular file.
6686 6694 */
6687 6695 ASSERT(vattr.va_mask & AT_MODE);
6688 6696 if (MANDMODE(vattr.va_mode)) {
6689 6697 nfs_rw_exit(&drp->r_rwlock);
6690 6698 return (EACCES);
6691 6699 }
6692 6700
6693 6701 /*
6694 6702 * If this happens to be a mknod of a regular file, then flags will
6695 6703 * have neither FREAD or FWRITE. However, we must set at least one
6696 6704 * for the call to nfs4open_otw. If it's open(O_CREAT) driving
6697 6705 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6698 6706 * set (based on openmode specified by app).
6699 6707 */
6700 6708 if ((flags & (FREAD|FWRITE)) == 0)
6701 6709 flags |= (FREAD|FWRITE);
6702 6710
6703 6711 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6704 6712
6705 6713 if (vp != NULL) {
6706 6714 /* if create was successful, throw away the file's pages */
6707 6715 if (!error && (vattr.va_mask & AT_SIZE))
6708 6716 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6709 6717 cr);
6710 6718 /* release the lookup hold */
6711 6719 VN_RELE(vp);
6712 6720 vp = NULL;
6713 6721 }
6714 6722
6715 6723 /*
6716 6724 * validate that we opened a regular file. This handles a misbehaving
6717 6725 * server that returns an incorrect FH.
6718 6726 */
6719 6727 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6720 6728 error = EISDIR;
6721 6729 VN_RELE(*vpp);
6722 6730 }
6723 6731
6724 6732 /*
6725 6733 * If this is not an exclusive create, then the CREATE
6726 6734 * request will be made with the GUARDED mode set. This
6727 6735 * means that the server will return EEXIST if the file
6728 6736 * exists. The file could exist because of a retransmitted
6729 6737 * request. In this case, we recover by starting over and
6730 6738 * checking to see whether the file exists. This second
6731 6739 * time through it should and a CREATE request will not be
6732 6740 * sent.
6733 6741 *
6734 6742 * This handles the problem of a dangling CREATE request
6735 6743 * which contains attributes which indicate that the file
6736 6744 * should be truncated. This retransmitted request could
6737 6745 * possibly truncate valid data in the file if not caught
6738 6746 * by the duplicate request mechanism on the server or if
6739 6747 * not caught by other means. The scenario is:
6740 6748 *
6741 6749 * Client transmits CREATE request with size = 0
6742 6750 * Client times out, retransmits request.
6743 6751 * Response to the first request arrives from the server
6744 6752 * and the client proceeds on.
6745 6753 * Client writes data to the file.
6746 6754 * The server now processes retransmitted CREATE request
6747 6755 * and truncates file.
6748 6756 *
6749 6757 * The use of the GUARDED CREATE request prevents this from
6750 6758 * happening because the retransmitted CREATE would fail
6751 6759 * with EEXIST and would not truncate the file.
6752 6760 */
6753 6761 if (error == EEXIST && exclusive == NONEXCL) {
6754 6762 #ifdef DEBUG
6755 6763 nfs4_create_misses++;
6756 6764 #endif
6757 6765 goto top;
6758 6766 }
6759 6767 nfs_rw_exit(&drp->r_rwlock);
6760 6768 if (truncating && !error && *vpp) {
6761 6769 vnode_t *tvp;
6762 6770 rnode4_t *trp;
6763 6771 /*
6764 6772 * existing file got truncated, notify.
6765 6773 */
6766 6774 tvp = *vpp;
6767 6775 trp = VTOR4(tvp);
6768 6776 if (IS_SHADOW(tvp, trp))
6769 6777 tvp = RTOV4(trp);
6770 6778 vnevent_create(tvp, ct);
6771 6779 }
6772 6780 return (error);
6773 6781 }
6774 6782
6775 6783 /*
6776 6784  * Create compound (for mkdir, mknod, symlink):
6777 6785  * { Putfh <dfh>; Create; Getfh; Getattr }
6778 6786  * It's okay if setattr failed to set gid - this is not considered
6779 6787  * an error, but purge attrs in that case.
 *
 * dvp: parent directory; nm: new entry name; data: symlink target text
 * (NF4LNK) or specdata4 pointer (NF4BLK/NF4CHR); va: requested
 * attributes; *vpp: on success, the new object's vnode; type: the
 * nfs_ftype4 to create (NF4DIR/NF4LNK/NF4BLK/NF4CHR/NF4SOCK/NF4FIFO).
 * When the parent directory carries the setgid bit (or MI4_GRPID), a
 * longer 10-op compound with NVERIFY/SETATTR is built to propagate the
 * parent's group to the new object.
6780 6788  */
6781 6789 static int
6782 6790 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6783 6791     vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6784 6792 {
6785 6793 	int need_end_op = FALSE;
6786 6794 	COMPOUND4args_clnt args;
6787 6795 	COMPOUND4res_clnt res, *resp = NULL;
6788 6796 	nfs_argop4 *argop;
6789 6797 	nfs_resop4 *resop;
6790 6798 	int doqueue;
6791 6799 	mntinfo4_t *mi;
6792 6800 	rnode4_t *drp = VTOR4(dvp);
6793 6801 	change_info4 *cinfo;
6794 6802 	GETFH4res *gf_res;
6795 6803 	struct vattr vattr;
6796 6804 	vnode_t *vp;
6797 6805 	fattr4 *crattr;
6798 6806 	bool_t needrecov = FALSE;
6799 6807 	nfs4_recov_state_t recov_state;
6800 6808 	nfs4_sharedfh_t *sfhp = NULL;
6801 6809 	hrtime_t t;
6802 6810 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6803 6811 	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6804 6812 	dirattr_info_t dinfo, *dinfop;
6805 6813 	servinfo4_t *svp;
6806 6814 	bitmap4 supp_attrs;
6807 6815
6808 6816 	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6809 6817 	    type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6810 6818
6811 6819 	mi = VTOMI4(dvp);
6812 6820
6813 6821 	/*
6814 6822 	 * Make sure we properly deal with setting the right gid
6815 6823 	 * on a new directory to reflect the parent's setgid bit
6816 6824 	 */
6817 6825 	setgid_flag = 0;
6818 6826 	if (type == NF4DIR) {
6819 6827 		struct vattr dva;
6820 6828
6821 6829 		va->va_mode &= ~VSGID;
6822 6830 		dva.va_mask = AT_MODE | AT_GID;
6823 6831 		if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6824 6832
6825 6833 			/*
6826 6834 			 * If the parent's directory has the setgid bit set
6827 6835 			 * _and_ the client was able to get a valid mapping
6828 6836 			 * for the parent dir's owner_group, we want to
6829 6837 			 * append NVERIFY(owner_group == dva.va_gid) and
6830 6838 			 * SETATTR to the CREATE compound.
6831 6839 			 */
6832 6840 			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6833 6841 				setgid_flag = 1;
6834 6842 				va->va_mode |= VSGID;
6835 6843 				if (dva.va_gid != GID_NOBODY) {
6836 6844 					va->va_mask |= AT_GID;
6837 6845 					va->va_gid = dva.va_gid;
6838 6846 				}
6839 6847 			}
6840 6848 		}
6841 6849 	}
6842 6850
6843 6851 	/*
6844 6852 	 * Create ops:
6845 6853 	 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6846 6854 	 * 5:restorefh(dir) 6:getattr(dir)
6847 6855 	 *
6848 6856 	 * if (setgid)
6849 6857 	 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6850 6858 	 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6851 6859 	 * 8:nverify 9:setattr
6852 6860 	 */
6853 6861 	if (setgid_flag) {
6854 6862 		numops = 10;
6855 6863 		idx_create = 1;
6856 6864 		idx_fattr = 3;
6857 6865 	} else {
6858 6866 		numops = 7;
6859 6867 		idx_create = 2;
6860 6868 		idx_fattr = 4;
6861 6869 	}
6862 6870
6863 6871 	ASSERT(nfs_zone() == mi->mi_zone);
6864 6872 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6865 6873 		return (EINTR);
6866 6874 	}
6867 6875 	recov_state.rs_flags = 0;
6868 6876 	recov_state.rs_num_retry_despite_err = 0;
6869 6877
6870 6878 	argoplist_size = numops * sizeof (nfs_argop4);
6871 6879 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
6872 6880
6873 6881 recov_retry:
6874 6882 	if (type == NF4LNK)
6875 6883 		args.ctag = TAG_SYMLINK;
6876 6884 	else if (type == NF4DIR)
6877 6885 		args.ctag = TAG_MKDIR;
6878 6886 	else
6879 6887 		args.ctag = TAG_MKNOD;
6880 6888
6881 6889 	args.array_len = numops;
6882 6890 	args.array = argop;
6883 6891
6884 6892 	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6885 6893 		nfs_rw_exit(&drp->r_rwlock);
6886 6894 		kmem_free(argop, argoplist_size);
6887 6895 		return (e.error);
6888 6896 	}
6889 6897 	need_end_op = TRUE;
6890 6898
6891 6899
6892 6900 	/* 0: putfh directory */
6893 6901 	argop[0].argop = OP_CPUTFH;
6894 6902 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6895 6903
6896 6904 	/* 1/2: Create object */
6897 6905 	argop[idx_create].argop = OP_CCREATE;
6898 6906 	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6899 6907 	argop[idx_create].nfs_argop4_u.opccreate.type = type;
6900 6908 	if (type == NF4LNK) {
6901 6909 		/*
6902 6910 		 * symlink, treat name as data
6903 6911 		 */
6904 6912 		ASSERT(data != NULL);
6905 6913 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6906 6914 		    (char *)data;
6907 6915 	}
6908 6916 	if (type == NF4BLK || type == NF4CHR) {
6909 6917 		ASSERT(data != NULL);
6910 6918 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6911 6919 		    *((specdata4 *)data);
6912 6920 	}
6913 6921
6914 6922 	crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6915 6923
6916 6924 	svp = drp->r_server;
6917 6925 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6918 6926 	supp_attrs = svp->sv_supp_attrs;
6919 6927 	nfs_rw_exit(&svp->sv_lock);
6920 6928
6921 6929 	if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6922 6930 		nfs_rw_exit(&drp->r_rwlock);
6923 6931 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6924 6932 		e.error = EINVAL;
6925 6933 		kmem_free(argop, argoplist_size);
6926 6934 		return (e.error);
6927 6935 	}
6928 6936
6929 6937 	/* 2/3: getfh fh of created object */
6930 6938 	ASSERT(idx_create + 1 == idx_fattr - 1);
6931 6939 	argop[idx_create + 1].argop = OP_GETFH;
6932 6940
6933 6941 	/* 3/4: getattr of new object */
6934 6942 	argop[idx_fattr].argop = OP_GETATTR;
6935 6943 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6936 6944 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6937 6945
6938 6946 	if (setgid_flag) {
6939 6947 		vattr_t _v;
6940 6948
6941 6949 		argop[4].argop = OP_SAVEFH;
6942 6950
6943 6951 		argop[5].argop = OP_CPUTFH;
6944 6952 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6945 6953
6946 6954 		argop[6].argop = OP_GETATTR;
6947 6955 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6948 6956 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6949 6957
6950 6958 		argop[7].argop = OP_RESTOREFH;
6951 6959
6952 6960 		/*
6953 6961 		 * nverify
6954 6962 		 *
6955 6963 		 * XXX - Revisit the last argument to nfs4_end_op()
6956 6964 		 *	 once 5020486 is fixed.
6957 6965 		 */
6958 6966 		_v.va_mask = AT_GID;
6959 6967 		_v.va_gid = va->va_gid;
6960 6968 		if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
6961 6969 		    supp_attrs)) {
6962 6970 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6963 6971 			nfs_rw_exit(&drp->r_rwlock);
6964 6972 			nfs4_fattr4_free(crattr);
6965 6973 			kmem_free(argop, argoplist_size);
6966 6974 			return (e.error);
6967 6975 		}
6968 6976
6969 6977 		/*
6970 6978 		 * setattr
6971 6979 		 *
6972 6980 		 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
6973 6981 		 * so no need for stateid or flags. Also we specify NULL
6974 6982 		 * rp since we're only interested in setting owner_group
6975 6983 		 * attributes.
6976 6984 		 */
6977 6985 		nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
6978 6986 		    &e.error, 0);
6979 6987
6980 6988 		if (e.error) {
6981 6989 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6982 6990 			nfs_rw_exit(&drp->r_rwlock);
6983 6991 			nfs4_fattr4_free(crattr);
6984 6992 			nfs4args_verify_free(&argop[8]);
6985 6993 			kmem_free(argop, argoplist_size);
6986 6994 			return (e.error);
6987 6995 		}
6988 6996 	} else {
6989 6997 		argop[1].argop = OP_SAVEFH;
6990 6998
6991 6999 		argop[5].argop = OP_RESTOREFH;
6992 7000
6993 7001 		argop[6].argop = OP_GETATTR;
6994 7002 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6995 7003 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6996 7004 	}
6997 7005
6998 7006 	dnlc_remove(dvp, nm);
6999 7007
7000 7008 	doqueue = 1;
7001 7009 	t = gethrtime();
7002 7010 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7003 7011
7004 7012 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7005 7013 	if (e.error) {
7006 7014 		PURGE_ATTRCACHE4(dvp);
7007 7015 		if (!needrecov)
7008 7016 			goto out;
7009 7017 	}
7010 7018
7011 7019 	if (needrecov) {
7012 7020 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7013 7021 		    OP_CREATE, NULL, NULL, NULL) == FALSE) {
7014 7022 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7015 7023 			    needrecov);
7016 7024 			need_end_op = FALSE;
7017 7025 			nfs4_fattr4_free(crattr);
7018 7026 			if (setgid_flag) {
7019 7027 				nfs4args_verify_free(&argop[8]);
7020 7028 				nfs4args_setattr_free(&argop[9]);
7021 7029 			}
7022 7030 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7023 7031 			goto recov_retry;
7024 7032 		}
7025 7033 	}
7026 7034
7027 7035 	resp = &res;
7028 7036
7029 7037 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7030 7038
7031 7039 		if (res.status == NFS4ERR_BADOWNER)
7032 7040 			nfs4_log_badowner(mi, OP_CREATE);
7033 7041
7034 7042 		e.error = geterrno4(res.status);
7035 7043
7036 7044 		/*
7037 7045 		 * This check is left over from when create was implemented
7038 7046 		 * using a setattr op (instead of createattrs).  If the
7039 7047 		 * putfh/create/getfh failed, the error was returned.  If
7040 7048 		 * setattr/getattr failed, we keep going.
7041 7049 		 *
7042 7050 		 * It might be better to get rid of the GETFH also, and just
7043 7051 		 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7044 7052 		 * Then if any of the operations failed, we could return the
7045 7053 		 * error now, and remove much of the error code below.
7046 7054 		 */
7047 7055 		if (res.array_len <= idx_fattr) {
7048 7056 			/*
7049 7057 			 * Either Putfh, Create or Getfh failed.
7050 7058 			 */
7051 7059 			PURGE_ATTRCACHE4(dvp);
7052 7060 			/*
7053 7061 			 * nfs4_purge_stale_fh() may generate otw calls through
7054 7062 			 * nfs4_invalidate_pages. Hence the need to call
7055 7063 			 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7056 7064 			 */
7057 7065 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7058 7066 			    needrecov);
7059 7067 			need_end_op = FALSE;
7060 7068 			nfs4_purge_stale_fh(e.error, dvp, cr);
7061 7069 			goto out;
7062 7070 		}
7063 7071 	}
7064 7072
7065 7073 	resop = &res.array[idx_create];	/* create res */
7066 7074 	cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7067 7075
7068 7076 	resop = &res.array[idx_create + 1]; /* getfh res */
7069 7077 	gf_res = &resop->nfs_resop4_u.opgetfh;
7070 7078
7071 7079 	sfhp = sfh4_get(&gf_res->object, mi);
7072 7080 	if (e.error) {
7073 7081 		*vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7074 7082 		    fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7075 7083 		if (vp->v_type == VNON) {
7076 7084 			vattr.va_mask = AT_TYPE;
7077 7085 			/*
7078 7086 			 * Need to call nfs4_end_op before nfs4getattr to avoid
7079 7087 			 * potential nfs4_start_op deadlock. See RFE 4777612.
7080 7088 			 */
7081 7089 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7082 7090 			    needrecov);
7083 7091 			need_end_op = FALSE;
7084 7092 			e.error = nfs4getattr(vp, &vattr, cr);
7085 7093 			if (e.error) {
7086 7094 				VN_RELE(vp);
7087 7095 				*vpp = NULL;
7088 7096 				goto out;
7089 7097 			}
7090 7098 			vp->v_type = vattr.va_type;
7091 7099 		}
7092 7100 		e.error = 0;
7093 7101 	} else {
7094 7102 		*vpp = vp = makenfs4node(sfhp,
7095 7103 		    &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7096 7104 		    dvp->v_vfsp, t, cr,
7097 7105 		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7098 7106 	}
7099 7107
7100 7108 	/*
7101 7109 	 * If compound succeeded, then update dir attrs
7102 7110 	 */
7103 7111 	if (res.status == NFS4_OK) {
7104 7112 		dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7105 7113 		dinfo.di_cred = cr;
7106 7114 		dinfo.di_time_call = t;
7107 7115 		dinfop = &dinfo;
7108 7116 	} else
7109 7117 		dinfop = NULL;
7110 7118
7111 7119 	/* Update directory cache attribute, readdir and dnlc caches */
7112 7120 	nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7113 7121
7114 7122 out:
7115 7123 	if (sfhp != NULL)
7116 7124 		sfh4_rele(&sfhp);
7117 7125 	nfs_rw_exit(&drp->r_rwlock);
7118 7126 	nfs4_fattr4_free(crattr);
7119 7127 	if (setgid_flag) {
7120 7128 		nfs4args_verify_free(&argop[8]);
7121 7129 		nfs4args_setattr_free(&argop[9]);
7122 7130 	}
7123 7131 	if (resp)
7124 7132 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7125 7133 	if (need_end_op)
7126 7134 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7127 7135
7128 7136 	kmem_free(argop, argoplist_size);
7129 7137 	return (e.error);
7130 7138 }
7131 7139
7132 7140 /* ARGSUSED */
/*
 * Create a non-regular file (device, fifo or socket) named nm in dvp.
 * Translates va->va_type into the matching nfs_ftype4 (and specdata4
 * for VCHR/VBLK) and issues the create via call_nfs4_create_req().
 * Devices are returned to the caller wrapped in a special vnode via
 * specvp().  Returns EINVAL for unsupported vnode types.
 */
7133 7141 static int
7134 7142 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7135 7143     int mode, vnode_t **vpp, cred_t *cr)
7136 7144 {
7137 7145 	int error;
7138 7146 	vnode_t *vp;
7139 7147 	nfs_ftype4 type;
7140 7148 	specdata4 spec, *specp = NULL;
7141 7149
7142 7150 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7143 7151
7144 7152 	switch (va->va_type) {
7145 7153 	case VCHR:
7146 7154 	case VBLK:
7147 7155 		type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7148 7156 		spec.specdata1 = getmajor(va->va_rdev);
7149 7157 		spec.specdata2 = getminor(va->va_rdev);
7150 7158 		specp = &spec;
7151 7159 		break;
7152 7160
7153 7161 	case VFIFO:
7154 7162 		type = NF4FIFO;
7155 7163 		break;
7156 7164 	case VSOCK:
7157 7165 		type = NF4SOCK;
7158 7166 		break;
7159 7167
7160 7168 	default:
7161 7169 		return (EINVAL);
7162 7170 	}
7163 7171
7164 7172 	error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7165 7173 	if (error) {
7166 7174 		return (error);
7167 7175 	}
7168 7176
7169 7177 	/*
7170 7178 	 * This might not be needed any more; special case to deal
7171 7179 	 * with problematic v2/v3 servers.  Since create was unable
7172 7180 	 * to set group correctly, not sure what hope setattr has.
7173 7181 	 */
7174 7182 	if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7175 7183 		va->va_mask = AT_GID;
7176 7184 		(void) nfs4setattr(vp, va, 0, cr, NULL);
7177 7185 	}
7178 7186
7179 7187 	/*
7180 7188 	 * If vnode is a device create special vnode
7181 7189 	 */
7182 7190 	if (ISVDEV(vp->v_type)) {
7183 7191 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7184 7192 		VN_RELE(vp);
7185 7193 	} else {
7186 7194 		*vpp = vp;
7187 7195 	}
7188 7196 	return (error);
7189 7197 }
7190 7198
7191 7199 /*
7192 7200  * Remove requires that the current fh be the target directory.
7193 7201  * After the operation, the current fh is unchanged.
7194 7202  * The compound op structure is:
7195 7203  *      PUTFH(targetdir), REMOVE
 *
 * NOTE(review): the compound actually built below is three ops --
 * PUTFH(dir), REMOVE, GETATTR(dir) -- the trailing GETATTR refreshing
 * the directory's cached attributes.
7196 7204  *
7197 7205  * Weirdness: if the vnode to be removed is open
7198 7206  * we rename it instead of removing it and nfs_inactive
7199 7207  * will remove the new name.
7200 7208  */
7201 7209 /* ARGSUSED */
7202 7210 static int
7203 7211 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7204 7212 {
7205 7213 	COMPOUND4args_clnt args;
7206 7214 	COMPOUND4res_clnt res, *resp = NULL;
7207 7215 	REMOVE4res *rm_res;
7208 7216 	nfs_argop4 argop[3];
7209 7217 	nfs_resop4 *resop;
7210 7218 	vnode_t *vp;
7211 7219 	char *tmpname;
7212 7220 	int doqueue;
7213 7221 	mntinfo4_t *mi;
7214 7222 	rnode4_t *rp;
7215 7223 	rnode4_t *drp;
7216 7224 	int needrecov = 0;
7217 7225 	nfs4_recov_state_t recov_state;
7218 7226 	int isopen;
7219 7227 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7220 7228 	dirattr_info_t dinfo;
7221 7229
7222 7230 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7223 7231 		return (EPERM);
7224 7232 	drp = VTOR4(dvp);
7225 7233 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7226 7234 		return (EINTR);
7227 7235
7228 7236 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7229 7237 	if (e.error) {
7230 7238 		nfs_rw_exit(&drp->r_rwlock);
7231 7239 		return (e.error);
7232 7240 	}
7233 7241
7234 7242 	if (vp->v_type == VDIR) {
7235 7243 		VN_RELE(vp);
7236 7244 		nfs_rw_exit(&drp->r_rwlock);
7237 7245 		return (EISDIR);
7238 7246 	}
7239 7247
7240 7248 	/*
7241 7249 	 * First just remove the entry from the name cache, as it
7242 7250 	 * is most likely the only entry for this vp.
7243 7251 	 */
7244 7252 	dnlc_remove(dvp, nm);
7245 7253
7246 7254 	rp = VTOR4(vp);
7247 7255
7248 7256 	/*
7249 7257 	 * For regular file types, check to see if the file is open by looking
7250 7258 	 * at the open streams.
7251 7259 	 * For all other types, check the reference count on the vnode.  Since
7252 7260 	 * they are not opened OTW they never have an open stream.
7253 7261 	 *
7254 7262 	 * If the file is open, rename it to .nfsXXXX.
7255 7263 	 */
7256 7264 	if (vp->v_type != VREG) {
7257 7265 		/*
7258 7266 		 * If the file has a v_count > 1 then there may be more than one
7259 7267 		 * entry in the name cache due to multiple links or an open file,
7260 7268 		 * but we don't have the real reference count so flush all
7261 7269 		 * possible entries.
7262 7270 		 */
7263 7271 		if (vp->v_count > 1)
7264 7272 			dnlc_purge_vp(vp);
7265 7273
7266 7274 		/*
7267 7275 		 * Now we have the real reference count.
7268 7276 		 */
7269 7277 		isopen = vp->v_count > 1;
7270 7278 	} else {
7271 7279 		mutex_enter(&rp->r_os_lock);
7272 7280 		isopen = list_head(&rp->r_open_streams) != NULL;
7273 7281 		mutex_exit(&rp->r_os_lock);
7274 7282 	}
7275 7283
7276 7284 	mutex_enter(&rp->r_statelock);
7277 7285 	if (isopen &&
7278 7286 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7279 7287 		mutex_exit(&rp->r_statelock);
		/* Silly-rename: the file is open, rename to .nfsXXXX instead. */
7280 7288 		tmpname = newname();
7281 7289 		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7282 7290 		if (e.error)
7283 7291 			kmem_free(tmpname, MAXNAMELEN);
7284 7292 		else {
7285 7293 			mutex_enter(&rp->r_statelock);
7286 7294 			if (rp->r_unldvp == NULL) {
7287 7295 				VN_HOLD(dvp);
7288 7296 				rp->r_unldvp = dvp;
7289 7297 				if (rp->r_unlcred != NULL)
7290 7298 					crfree(rp->r_unlcred);
7291 7299 				crhold(cr);
7292 7300 				rp->r_unlcred = cr;
7293 7301 				rp->r_unlname = tmpname;
7294 7302 			} else {
7295 7303 				kmem_free(rp->r_unlname, MAXNAMELEN);
7296 7304 				rp->r_unlname = tmpname;
7297 7305 			}
7298 7306 			mutex_exit(&rp->r_statelock);
7299 7307 		}
7300 7308 		VN_RELE(vp);
7301 7309 		nfs_rw_exit(&drp->r_rwlock);
7302 7310 		return (e.error);
7303 7311 	}
7304 7312 	/*
7305 7313 	 * Actually remove the file/dir
7306 7314 	 */
7307 7315 	mutex_exit(&rp->r_statelock);
7308 7316
7309 7317 	/*
7310 7318 	 * We need to flush any dirty pages which happen to
7311 7319 	 * be hanging around before removing the file.
7312 7320 	 * This shouldn't happen very often since in NFSv4
7313 7321 	 * we should be close to open consistent.
7314 7322 	 */
7315 7323 	if (nfs4_has_pages(vp) &&
7316 7324 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7317 7325 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7318 7326 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7319 7327 			mutex_enter(&rp->r_statelock);
7320 7328 			if (!rp->r_error)
7321 7329 				rp->r_error = e.error;
7322 7330 			mutex_exit(&rp->r_statelock);
7323 7331 		}
7324 7332 	}
7325 7333
7326 7334 	mi = VTOMI4(dvp);
7327 7335
7328 7336 	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7329 7337 	recov_state.rs_flags = 0;
7330 7338 	recov_state.rs_num_retry_despite_err = 0;
7331 7339
7332 7340 recov_retry:
7333 7341 	/*
7334 7342 	 * Remove ops: putfh dir; remove
7335 7343 	 */
7336 7344 	args.ctag = TAG_REMOVE;
7337 7345 	args.array_len = 3;
7338 7346 	args.array = argop;
7339 7347
7340 7348 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7341 7349 	if (e.error) {
7342 7350 		nfs_rw_exit(&drp->r_rwlock);
7343 7351 		VN_RELE(vp);
7344 7352 		return (e.error);
7345 7353 	}
7346 7354
7347 7355 	/* putfh directory */
7348 7356 	argop[0].argop = OP_CPUTFH;
7349 7357 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7350 7358
7351 7359 	/* remove */
7352 7360 	argop[1].argop = OP_CREMOVE;
7353 7361 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7354 7362
7355 7363 	/* getattr dir */
7356 7364 	argop[2].argop = OP_GETATTR;
7357 7365 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7358 7366 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
7359 7367
7360 7368 	doqueue = 1;
7361 7369 	dinfo.di_time_call = gethrtime();
7362 7370 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7363 7371
7364 7372 	PURGE_ATTRCACHE4(vp);
7365 7373
7366 7374 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7367 7375 	if (e.error)
7368 7376 		PURGE_ATTRCACHE4(dvp);
7369 7377
7370 7378 	if (needrecov) {
7371 7379 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7372 7380 		    NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7373 7381 			if (!e.error)
7374 7382 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7375 7383 				    (caddr_t)&res);
7376 7384 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7377 7385 			    needrecov);
7378 7386 			goto recov_retry;
7379 7387 		}
7380 7388 	}
7381 7389
7382 7390 	/*
7383 7391 	 * Matching nfs4_end_op() for start_op() above.
7384 7392 	 * There is a path in the code below which calls
7385 7393 	 * nfs4_purge_stale_fh(), which may generate otw calls through
7386 7394 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7387 7395 	 * here to avoid nfs4_start_op() deadlock.
7388 7396 	 */
7389 7397 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7390 7398
7391 7399 	if (!e.error) {
7392 7400 		resp = &res;
7393 7401
7394 7402 		if (res.status) {
7395 7403 			e.error = geterrno4(res.status);
7396 7404 			PURGE_ATTRCACHE4(dvp);
7397 7405 			nfs4_purge_stale_fh(e.error, dvp, cr);
7398 7406 		} else {
7399 7407 			resop = &res.array[1];	/* remove res */
7400 7408 			rm_res = &resop->nfs_resop4_u.opremove;
7401 7409
7402 7410 			dinfo.di_garp =
7403 7411 			    &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7404 7412 			dinfo.di_cred = cr;
7405 7413
7406 7414 			/* Update directory attr, readdir and dnlc caches */
7407 7415 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7408 7416 			    &dinfo);
7409 7417 		}
7410 7418 	}
7411 7419 	nfs_rw_exit(&drp->r_rwlock);
7412 7420 	if (resp)
7413 7421 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7414 7422
7415 7423 	if (e.error == 0) {
7416 7424 		vnode_t *tvp;
7417 7425 		rnode4_t *trp;
7418 7426 		trp = VTOR4(vp);
7419 7427 		tvp = vp;
7420 7428 		if (IS_SHADOW(vp, trp))
7421 7429 			tvp = RTOV4(trp);
7422 7430 		vnevent_remove(tvp, dvp, nm, ct);
7423 7431 	}
7424 7432 	VN_RELE(vp);
7425 7433 	return (e.error);
7426 7434 }
7427 7435
7428 7436 /*
7429 7437 * Link requires that the current fh be the target directory and the
7430 7438 * saved fh be the source fh. After the operation, the current fh is unchanged.
7431 7439 * Thus the compound op structure is:
7432 7440 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
7433 7441 * GETATTR(file)
7434 7442 */
/* ARGSUSED */
/*
 * VOP_LINK for NFSv4: create hard link 'tnm' in directory 'tdvp' pointing
 * at source vnode 'svp'.  Builds and issues the 7-op compound described in
 * the comment above, retrying through the recovery framework as needed.
 * Returns 0 on success or an errno.  Serializes against other directory
 * operations by taking the target directory's r_rwlock as writer.
 */
static int
nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	LINK4res *ln_res;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	vnode_t *realvp, *nvp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *tdrp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo;

	ASSERT(*tnm != '\0');
	ASSERT(tdvp->v_type == VDIR);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));

	/* Cross-zone access to an NFS mount is not permitted. */
	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
		return (EPERM);
	/* If svp is a stacked vnode, operate on the underlying vnode. */
	if (VOP_REALVP(svp, &realvp, ct) == 0) {
		svp = realvp;
		ASSERT(nfs4_consistent_type(svp));
	}

	tdrp = VTOR4(tdvp);
	mi = VTOMI4(svp);

	/*
	 * MI4_LINK is cleared below if the server ever answers a LINK
	 * with EOPNOTSUPP, so subsequent attempts fail fast here.
	 */
	if (!(mi->mi_flags & MI4_LINK)) {
		return (EOPNOTSUPP);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
		return (EINTR);

recov_retry:
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_LINK;

	/*
	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
	 * restorefh; getattr(fl)
	 */
	args.array_len = 7;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		nfs_rw_exit(&tdrp->r_rwlock);
		return (e.error);
	}

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;

	/* 1. save current fh to free up the space for the dir */
	argop[1].argop = OP_SAVEFH;

	/* 2. putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;

	/* 3. link: current_fh is targetdir, saved_fh is source */
	argop[3].argop = OP_CLINK;
	argop[3].nfs_argop4_u.opclink.cnewname = tnm;

	/* 4. Get attributes of dir */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	/* 5. If link was successful, restore current vp to file */
	argop[5].argop = OP_RESTOREFH;

	/* 6. Get attributes of linked object */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = mi;

	/* Drop any stale name-cache entry before going over the wire. */
	dnlc_remove(tdvp, tnm);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
	if (e.error != 0 && !needrecov) {
		/* Hard failure: invalidate both caches and bail out. */
		PURGE_ATTRCACHE4(tdvp);
		PURGE_ATTRCACHE4(svp);
		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
		    NULL, NULL, OP_LINK, NULL, NULL, NULL);
		if (abort == FALSE) {
			/* Recovery was started; free and retry the call. */
			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
			    needrecov);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		} else {
			if (e.error != 0) {
				PURGE_ATTRCACHE4(tdvp);
				PURGE_ATTRCACHE4(svp);
				nfs4_end_op(VTOMI4(svp), svp, tdvp,
				    &recov_state, needrecov);
				goto out;
			}
			/* fall through for res.status case */
		}
	}

	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);

	/* From here on 'resp' being set means res must be xdr_free'd. */
	resp = &res;
	if (res.status) {
		/* If link succeeded, then don't return error */
		e.error = geterrno4(res.status);
		if (res.array_len <= 4) {
			/*
			 * Either Putfh, Savefh, Putfh dir, or Link failed
			 */
			PURGE_ATTRCACHE4(svp);
			PURGE_ATTRCACHE4(tdvp);
			if (e.error == EOPNOTSUPP) {
				/* Server has no LINK support; remember that. */
				mutex_enter(&mi->mi_lock);
				mi->mi_flags &= ~MI4_LINK;
				mutex_exit(&mi->mi_lock);
			}
			/* Remap EISDIR to EPERM for non-root user for SVVS */
			/* XXX-LP */
			if (e.error == EISDIR && crgetuid(cr) != 0)
				e.error = EPERM;
			goto out;
		}
	}

	/* either no error or one of the postop getattr failed */

	/*
	 * XXX - if LINK succeeded, but no attrs were returned for link
	 * file, purge its cache.
	 *
	 * XXX Perform a simplified version of wcc checking. Instead of
	 * have another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	/*
	 * update cache with link file postattrs.
	 * Note: at this point resop points to link res.
	 */
	resop = &res.array[3];	/* link res */
	ln_res = &resop->nfs_resop4_u.oplink;
	if (res.status == NFS4_OK)
		e.error = nfs4_update_attrcache(res.status,
		    &res.array[6].nfs_resop4_u.opgetattr.ga_res,
		    t, svp, cr);

	/*
	 * Call makenfs4node to create the new shadow vp for tnm.
	 * We pass NULL attrs because we just cached attrs for
	 * the src object.  All we're trying to accomplish is to
	 * create the new shadow vnode.
	 */
	nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
	    tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));

	/* Update target cache attribute, readdir and dnlc caches */
	dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
	dinfo.di_time_call = t;
	dinfo.di_cred = cr;

	nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
	ASSERT(nfs4_consistent_type(tdvp));
	ASSERT(nfs4_consistent_type(svp));
	ASSERT(nfs4_consistent_type(nvp));
	VN_RELE(nvp);

	if (!e.error) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the source file of this link operation.
		 */
		trp = VTOR4(svp);
		tvp = svp;
		if (IS_SHADOW(svp, trp))
			tvp = RTOV4(trp);
		vnevent_link(tvp, ct);
	}
out:
	kmem_free(argop, argoplist_size);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	nfs_rw_exit(&tdrp->r_rwlock);

	return (e.error);
}
7656 7664
7657 7665 /* ARGSUSED */
7658 7666 static int
7659 7667 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7660 7668 caller_context_t *ct, int flags)
7661 7669 {
7662 7670 vnode_t *realvp;
7663 7671
7664 7672 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7665 7673 return (EPERM);
7666 7674 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7667 7675 ndvp = realvp;
7668 7676
7669 7677 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7670 7678 }
7671 7679
7672 7680 /*
7673 7681 * nfs4rename does the real work of renaming in NFS Version 4.
7674 7682 *
7675 7683 * A file handle is considered volatile for renaming purposes if either
7676 7684 * of the volatile bits are turned on. However, the compound may differ
7677 7685 * based on the likelihood of the filehandle to change during rename.
7678 7686 */
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp = NULL;
	vnode_t *ovp = NULL;
	char *tmpname = NULL;
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;
	int do_link = 1;
	nfsstat4 stat = NFS4_OK;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Renaming "." or ".." is never legal. */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Take both directories' r_rwlocks as writer.  Always acquire
	 * them in ascending rnode-address order so that two concurrent
	 * renames between the same pair of directories cannot deadlock.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/* Regular file: active means it has open streams. */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible.  This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename.  Some servers can
			 * not Rename over an Open file, so they return
			 * this error.  The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr,
				    NULL, 0);
			}
			if (error == EOPNOTSUPP || !do_link) {
				/* No LINK support: rename target aside. */
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the unlinked-file bookkeeping so the
			 * temporary name gets removed on last close.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself?  The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave.  A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration.  The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi))
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	else
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);

	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		/*
		 * Server can't rename over an open file; undo the Link
		 * we made and fall back to the two-rename scheme.
		 */
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp.  After the nfs4_link
		 * call we call VN_RELE to match that hold.  We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);

		/* Undo the unlinked file naming stuff we just did */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlname points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		if (nvp) {
			VN_RELE(nvp);
		}
		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Notify the rename vnevents to source vnode, and to the target
	 * vnode if it already existed.
	 */
	if (error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the vnode.  Each link is represented by
		 * a different vnode, in nfsv4.
		 */
		if (nvp) {
			trp = VTOR4(nvp);
			tvp = nvp;
			if (IS_SHADOW(nvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest(tvp, ndvp, nnm, ct);
		}

		/*
		 * if the source and destination directory are not the
		 * same notify the destination directory.
		 */
		if (VTOR4(odvp) != VTOR4(ndvp)) {
			trp = VTOR4(ndvp);
			tvp = ndvp;
			if (IS_SHADOW(ndvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest_dir(tvp, ct);
		}

		trp = VTOR4(ovp);
		tvp = ovp;
		if (IS_SHADOW(ovp, trp))
			tvp = RTOV4(trp);
		vnevent_rename_src(tvp, odvp, onm, ct);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
8091 8099
8092 8100 /*
8093 8101 * When the parent directory has changed, sv_dfh must be updated
8094 8102 */
8095 8103 static void
8096 8104 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8097 8105 {
8098 8106 svnode_t *sv = VTOSV(vp);
8099 8107 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8100 8108 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8101 8109
8102 8110 sfh4_hold(new_dfh);
8103 8111 sv->sv_dfh = new_dfh;
8104 8112 sfh4_rele(&old_dfh);
8105 8113 }
8106 8114
8107 8115 /*
8108 8116 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8109 8117 * when it is known that the filehandle is persistent through rename.
8110 8118 *
8111 8119 * Rename requires that the current fh be the target directory and the
8112 8120 * saved fh be the source directory. After the operation, the current fh
8113 8121 * is unchanged.
8114 8122 * The compound op structure for persistent fh rename is:
 * PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME
8116 8124 * Rather than bother with the directory postop args, we'll simply
8117 8125 * update that a change occurred in the cache, so no post-op getattrs.
8118 8126 */
8119 8127 static int
8120 8128 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8121 8129 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8122 8130 {
8123 8131 COMPOUND4args_clnt args;
8124 8132 COMPOUND4res_clnt res, *resp = NULL;
8125 8133 nfs_argop4 *argop;
8126 8134 nfs_resop4 *resop;
8127 8135 int doqueue, argoplist_size;
8128 8136 mntinfo4_t *mi;
8129 8137 rnode4_t *odrp = VTOR4(odvp);
8130 8138 rnode4_t *ndrp = VTOR4(ndvp);
8131 8139 RENAME4res *rn_res;
8132 8140 bool_t needrecov;
8133 8141 nfs4_recov_state_t recov_state;
8134 8142 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8135 8143 dirattr_info_t dinfo, *dinfop;
8136 8144
8137 8145 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8138 8146
8139 8147 recov_state.rs_flags = 0;
8140 8148 recov_state.rs_num_retry_despite_err = 0;
8141 8149
8142 8150 /*
8143 8151 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8144 8152 *
8145 8153 * If source/target are different dirs, then append putfh(src); getattr
8146 8154 */
8147 8155 args.array_len = (odvp == ndvp) ? 5 : 7;
8148 8156 argoplist_size = args.array_len * sizeof (nfs_argop4);
8149 8157 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8150 8158
8151 8159 recov_retry:
8152 8160 *statp = NFS4_OK;
8153 8161
8154 8162 /* No need to Lookup the file, persistent fh */
8155 8163 args.ctag = TAG_RENAME;
8156 8164
8157 8165 mi = VTOMI4(odvp);
8158 8166 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8159 8167 if (e.error) {
8160 8168 kmem_free(argop, argoplist_size);
8161 8169 return (e.error);
8162 8170 }
8163 8171
8164 8172 /* 0: putfh source directory */
8165 8173 argop[0].argop = OP_CPUTFH;
8166 8174 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8167 8175
8168 8176 /* 1: Save source fh to free up current for target */
8169 8177 argop[1].argop = OP_SAVEFH;
8170 8178
8171 8179 /* 2: putfh targetdir */
8172 8180 argop[2].argop = OP_CPUTFH;
8173 8181 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8174 8182
8175 8183 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8176 8184 argop[3].argop = OP_CRENAME;
8177 8185 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8178 8186 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8179 8187
8180 8188 /* 4: getattr (targetdir) */
8181 8189 argop[4].argop = OP_GETATTR;
8182 8190 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8183 8191 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8184 8192
8185 8193 if (ndvp != odvp) {
8186 8194
8187 8195 /* 5: putfh (sourcedir) */
8188 8196 argop[5].argop = OP_CPUTFH;
8189 8197 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8190 8198
8191 8199 /* 6: getattr (sourcedir) */
8192 8200 argop[6].argop = OP_GETATTR;
8193 8201 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8194 8202 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8195 8203 }
8196 8204
8197 8205 dnlc_remove(odvp, onm);
8198 8206 dnlc_remove(ndvp, nnm);
8199 8207
8200 8208 doqueue = 1;
8201 8209 dinfo.di_time_call = gethrtime();
8202 8210 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8203 8211
8204 8212 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8205 8213 if (e.error) {
8206 8214 PURGE_ATTRCACHE4(odvp);
8207 8215 PURGE_ATTRCACHE4(ndvp);
8208 8216 } else {
8209 8217 *statp = res.status;
8210 8218 }
8211 8219
8212 8220 if (needrecov) {
8213 8221 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8214 8222 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8215 8223 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8216 8224 if (!e.error)
8217 8225 (void) xdr_free(xdr_COMPOUND4res_clnt,
8218 8226 (caddr_t)&res);
8219 8227 goto recov_retry;
8220 8228 }
8221 8229 }
8222 8230
8223 8231 if (!e.error) {
8224 8232 resp = &res;
8225 8233 /*
8226 8234 * as long as OP_RENAME
8227 8235 */
8228 8236 if (res.status != NFS4_OK && res.array_len <= 4) {
8229 8237 e.error = geterrno4(res.status);
8230 8238 PURGE_ATTRCACHE4(odvp);
8231 8239 PURGE_ATTRCACHE4(ndvp);
8232 8240 /*
8233 8241 * System V defines rename to return EEXIST, not
8234 8242 * ENOTEMPTY if the target directory is not empty.
8235 8243 * Over the wire, the error is NFSERR_ENOTEMPTY
8236 8244 * which geterrno4 maps to ENOTEMPTY.
8237 8245 */
8238 8246 if (e.error == ENOTEMPTY)
8239 8247 e.error = EEXIST;
8240 8248 } else {
8241 8249
8242 8250 resop = &res.array[3]; /* rename res */
8243 8251 rn_res = &resop->nfs_resop4_u.oprename;
8244 8252
8245 8253 if (res.status == NFS4_OK) {
8246 8254 /*
8247 8255 * Update target attribute, readdir and dnlc
8248 8256 * caches.
8249 8257 */
8250 8258 dinfo.di_garp =
8251 8259 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8252 8260 dinfo.di_cred = cr;
8253 8261 dinfop = &dinfo;
8254 8262 } else
8255 8263 dinfop = NULL;
8256 8264
8257 8265 nfs4_update_dircaches(&rn_res->target_cinfo,
8258 8266 ndvp, NULL, NULL, dinfop);
8259 8267
8260 8268 /*
8261 8269 * Update source attribute, readdir and dnlc caches
8262 8270 *
8263 8271 */
8264 8272 if (ndvp != odvp) {
8265 8273 update_parentdir_sfh(renvp, ndvp);
8266 8274
8267 8275 if (dinfop)
8268 8276 dinfo.di_garp =
8269 8277 &(res.array[6].nfs_resop4_u.
8270 8278 opgetattr.ga_res);
8271 8279
8272 8280 nfs4_update_dircaches(&rn_res->source_cinfo,
8273 8281 odvp, NULL, NULL, dinfop);
8274 8282 }
8275 8283
8276 8284 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8277 8285 nnm);
8278 8286 }
8279 8287 }
8280 8288
8281 8289 if (resp)
8282 8290 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8283 8291 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8284 8292 kmem_free(argop, argoplist_size);
8285 8293
8286 8294 return (e.error);
8287 8295 }
8288 8296
8289 8297 /*
8290 8298 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8291 8299 * it is possible for the filehandle to change due to the rename.
8292 8300 *
8293 8301 * The compound req in this case includes a post-rename lookup and getattr
8294 8302 * to ensure that we have the correct fh and attributes for the object.
8295 8303 *
8296 8304 * Rename requires that the current fh be the target directory and the
8297 8305 * saved fh be the source directory. After the operation, the current fh
8298 8306 * is unchanged.
8299 8307 *
8300 8308 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8301 8309 * update the filehandle for the renamed object. We also get the old
8302 8310 * filehandle for historical reasons; this should be taken out sometime.
8303 8311 * This results in a rather cumbersome compound...
8304 8312 *
 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
8306 8314 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8307 8315 *
8308 8316 */
8309 8317 static int
8310 8318 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8311 8319 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8312 8320 {
8313 8321 COMPOUND4args_clnt args;
8314 8322 COMPOUND4res_clnt res, *resp = NULL;
8315 8323 int argoplist_size;
8316 8324 nfs_argop4 *argop;
8317 8325 nfs_resop4 *resop;
8318 8326 int doqueue;
8319 8327 mntinfo4_t *mi;
8320 8328 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8321 8329 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8322 8330 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8323 8331 RENAME4res *rn_res;
8324 8332 GETFH4res *ngf_res;
8325 8333 bool_t needrecov;
8326 8334 nfs4_recov_state_t recov_state;
8327 8335 hrtime_t t;
8328 8336 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8329 8337 dirattr_info_t dinfo, *dinfop = &dinfo;
8330 8338
8331 8339 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8332 8340
8333 8341 recov_state.rs_flags = 0;
8334 8342 recov_state.rs_num_retry_despite_err = 0;
8335 8343
8336 8344 recov_retry:
8337 8345 *statp = NFS4_OK;
8338 8346
8339 8347 /*
8340 8348 * There is a window between the RPC and updating the path and
8341 8349 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8342 8350 * code, so that it doesn't try to use the old path during that
8343 8351 * window.
8344 8352 */
8345 8353 mutex_enter(&orp->r_statelock);
8346 8354 while (orp->r_flags & R4RECEXPFH) {
8347 8355 klwp_t *lwp = ttolwp(curthread);
8348 8356
8349 8357 if (lwp != NULL)
8350 8358 lwp->lwp_nostop++;
8351 8359 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8352 8360 mutex_exit(&orp->r_statelock);
8353 8361 if (lwp != NULL)
8354 8362 lwp->lwp_nostop--;
8355 8363 return (EINTR);
8356 8364 }
8357 8365 if (lwp != NULL)
8358 8366 lwp->lwp_nostop--;
8359 8367 }
8360 8368 orp->r_flags |= R4RECEXPFH;
8361 8369 mutex_exit(&orp->r_statelock);
8362 8370
8363 8371 mi = VTOMI4(odvp);
8364 8372
8365 8373 args.ctag = TAG_RENAME_VFH;
8366 8374 args.array_len = (odvp == ndvp) ? 10 : 12;
8367 8375 argoplist_size = args.array_len * sizeof (nfs_argop4);
8368 8376 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8369 8377
8370 8378 /*
8371 8379 * Rename ops:
8372 8380 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8373 8381 * PUTFH(targetdir), RENAME, GETATTR(targetdir)
8374 8382 * LOOKUP(trgt), GETFH(new), GETATTR,
8375 8383 *
8376 8384 * if (odvp != ndvp)
8377 8385 * add putfh(sourcedir), getattr(sourcedir) }
8378 8386 */
8379 8387 args.array = argop;
8380 8388
8381 8389 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8382 8390 &recov_state, NULL);
8383 8391 if (e.error) {
8384 8392 kmem_free(argop, argoplist_size);
8385 8393 mutex_enter(&orp->r_statelock);
8386 8394 orp->r_flags &= ~R4RECEXPFH;
8387 8395 cv_broadcast(&orp->r_cv);
8388 8396 mutex_exit(&orp->r_statelock);
8389 8397 return (e.error);
8390 8398 }
8391 8399
8392 8400 /* 0: putfh source directory */
8393 8401 argop[0].argop = OP_CPUTFH;
8394 8402 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8395 8403
8396 8404 /* 1: Save source fh to free up current for target */
8397 8405 argop[1].argop = OP_SAVEFH;
8398 8406
8399 8407 /* 2: Lookup pre-rename fh of renamed object */
8400 8408 argop[2].argop = OP_CLOOKUP;
8401 8409 argop[2].nfs_argop4_u.opclookup.cname = onm;
8402 8410
8403 8411 /* 3: getfh fh of renamed object (before rename) */
8404 8412 argop[3].argop = OP_GETFH;
8405 8413
8406 8414 /* 4: putfh targetdir */
8407 8415 argop[4].argop = OP_CPUTFH;
8408 8416 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8409 8417
8410 8418 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8411 8419 argop[5].argop = OP_CRENAME;
8412 8420 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8413 8421 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8414 8422
8415 8423 /* 6: getattr of target dir (post op attrs) */
8416 8424 argop[6].argop = OP_GETATTR;
8417 8425 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8418 8426 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8419 8427
8420 8428 /* 7: Lookup post-rename fh of renamed object */
8421 8429 argop[7].argop = OP_CLOOKUP;
8422 8430 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8423 8431
8424 8432 /* 8: getfh fh of renamed object (after rename) */
8425 8433 argop[8].argop = OP_GETFH;
8426 8434
8427 8435 /* 9: getattr of renamed object */
8428 8436 argop[9].argop = OP_GETATTR;
8429 8437 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8430 8438 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8431 8439
8432 8440 /*
8433 8441 * If source/target dirs are different, then get new post-op
8434 8442 * attrs for source dir also.
8435 8443 */
8436 8444 if (ndvp != odvp) {
8437 8445 /* 10: putfh (sourcedir) */
8438 8446 argop[10].argop = OP_CPUTFH;
8439 8447 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8440 8448
8441 8449 /* 11: getattr (sourcedir) */
8442 8450 argop[11].argop = OP_GETATTR;
8443 8451 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8444 8452 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8445 8453 }
8446 8454
8447 8455 dnlc_remove(odvp, onm);
8448 8456 dnlc_remove(ndvp, nnm);
8449 8457
8450 8458 doqueue = 1;
8451 8459 t = gethrtime();
8452 8460 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8453 8461
8454 8462 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8455 8463 if (e.error) {
8456 8464 PURGE_ATTRCACHE4(odvp);
8457 8465 PURGE_ATTRCACHE4(ndvp);
8458 8466 if (!needrecov) {
8459 8467 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8460 8468 &recov_state, needrecov);
8461 8469 goto out;
8462 8470 }
8463 8471 } else {
8464 8472 *statp = res.status;
8465 8473 }
8466 8474
8467 8475 if (needrecov) {
8468 8476 bool_t abort;
8469 8477
8470 8478 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8471 8479 OP_RENAME, NULL, NULL, NULL);
8472 8480 if (abort == FALSE) {
8473 8481 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8474 8482 &recov_state, needrecov);
8475 8483 kmem_free(argop, argoplist_size);
8476 8484 if (!e.error)
8477 8485 (void) xdr_free(xdr_COMPOUND4res_clnt,
8478 8486 (caddr_t)&res);
8479 8487 mutex_enter(&orp->r_statelock);
8480 8488 orp->r_flags &= ~R4RECEXPFH;
8481 8489 cv_broadcast(&orp->r_cv);
8482 8490 mutex_exit(&orp->r_statelock);
8483 8491 goto recov_retry;
8484 8492 } else {
8485 8493 if (e.error != 0) {
8486 8494 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8487 8495 &recov_state, needrecov);
8488 8496 goto out;
8489 8497 }
8490 8498 /* fall through for res.status case */
8491 8499 }
8492 8500 }
8493 8501
8494 8502 resp = &res;
8495 8503 /*
8496 8504 * If OP_RENAME (or any prev op) failed, then return an error.
8497 8505 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8498 8506 */
8499 8507 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8500 8508 /*
8501 8509 * Error in an op other than last Getattr
8502 8510 */
8503 8511 e.error = geterrno4(res.status);
8504 8512 PURGE_ATTRCACHE4(odvp);
8505 8513 PURGE_ATTRCACHE4(ndvp);
8506 8514 /*
8507 8515 * System V defines rename to return EEXIST, not
8508 8516 * ENOTEMPTY if the target directory is not empty.
8509 8517 * Over the wire, the error is NFSERR_ENOTEMPTY
8510 8518 * which geterrno4 maps to ENOTEMPTY.
8511 8519 */
8512 8520 if (e.error == ENOTEMPTY)
8513 8521 e.error = EEXIST;
8514 8522 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8515 8523 needrecov);
8516 8524 goto out;
8517 8525 }
8518 8526
8519 8527 /* rename results */
8520 8528 rn_res = &res.array[5].nfs_resop4_u.oprename;
8521 8529
8522 8530 if (res.status == NFS4_OK) {
8523 8531 /* Update target attribute, readdir and dnlc caches */
8524 8532 dinfo.di_garp =
8525 8533 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8526 8534 dinfo.di_cred = cr;
8527 8535 dinfo.di_time_call = t;
8528 8536 } else
8529 8537 dinfop = NULL;
8530 8538
8531 8539 /* Update source cache attribute, readdir and dnlc caches */
8532 8540 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8533 8541
8534 8542 /* Update source cache attribute, readdir and dnlc caches */
8535 8543 if (ndvp != odvp) {
8536 8544 update_parentdir_sfh(ovp, ndvp);
8537 8545
8538 8546 /*
8539 8547 * If dinfop is non-NULL, then compound succeded, so
8540 8548 * set di_garp to attrs for source dir. dinfop is only
8541 8549 * set to NULL when compound fails.
8542 8550 */
8543 8551 if (dinfop)
8544 8552 dinfo.di_garp =
8545 8553 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8546 8554 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8547 8555 dinfop);
8548 8556 }
8549 8557
8550 8558 /*
8551 8559 * Update the rnode with the new component name and args,
8552 8560 * and if the file handle changed, also update it with the new fh.
8553 8561 * This is only necessary if the target object has an rnode
8554 8562 * entry and there is no need to create one for it.
8555 8563 */
8556 8564 resop = &res.array[8]; /* getfh new res */
8557 8565 ngf_res = &resop->nfs_resop4_u.opgetfh;
8558 8566
8559 8567 /*
8560 8568 * Update the path and filehandle for the renamed object.
8561 8569 */
8562 8570 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8563 8571
8564 8572 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8565 8573
8566 8574 if (res.status == NFS4_OK) {
8567 8575 resop++; /* getattr res */
8568 8576 e.error = nfs4_update_attrcache(res.status,
8569 8577 &resop->nfs_resop4_u.opgetattr.ga_res,
8570 8578 t, ovp, cr);
8571 8579 }
8572 8580
8573 8581 out:
8574 8582 kmem_free(argop, argoplist_size);
8575 8583 if (resp)
8576 8584 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8577 8585 mutex_enter(&orp->r_statelock);
8578 8586 orp->r_flags &= ~R4RECEXPFH;
8579 8587 cv_broadcast(&orp->r_cv);
8580 8588 mutex_exit(&orp->r_statelock);
8581 8589
8582 8590 return (e.error);
8583 8591 }
8584 8592
8585 8593 /* ARGSUSED */
8586 8594 static int
8587 8595 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8588 8596 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8589 8597 {
8590 8598 int error;
8591 8599 vnode_t *vp;
8592 8600
8593 8601 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8594 8602 return (EPERM);
8595 8603 /*
8596 8604 * As ".." has special meaning and rather than send a mkdir
8597 8605 * over the wire to just let the server freak out, we just
8598 8606 * short circuit it here and return EEXIST
8599 8607 */
8600 8608 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8601 8609 return (EEXIST);
8602 8610
8603 8611 /*
8604 8612 * Decision to get the right gid and setgid bit of the
8605 8613 * new directory is now made in call_nfs4_create_req.
8606 8614 */
8607 8615 va->va_mask |= AT_MODE;
8608 8616 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8609 8617 if (error)
8610 8618 return (error);
8611 8619
8612 8620 *vpp = vp;
8613 8621 return (0);
8614 8622 }
8615 8623
8616 8624
8617 8625 /*
8618 8626 * rmdir is using the same remove v4 op as does remove.
8619 8627 * Remove requires that the current fh be the target directory.
8620 8628 * After the operation, the current fh is unchanged.
8621 8629 * The compound op structure is:
8622 8630 * PUTFH(targetdir), REMOVE
8623 8631 */
/*ARGSUSED4*/
static int
nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	/*
	 * need_end_op tracks whether nfs4_start_op() is currently
	 * outstanding and must be balanced by nfs4_end_op() on the
	 * way out.  Several early-exit paths end the op themselves
	 * and clear this flag.
	 */
	int need_end_op = FALSE;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * As ".." has special meaning and rather than send a rmdir
	 * over the wire to just let the server freak out, we just
	 * short circuit it here and return EEXIST
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	/* Serialize with other directory-modifying ops on dvp. */
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * Since nfsv4 remove op works on both files and directories,
	 * check that the removed object is indeed a directory.
	 */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (ENOTDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If there vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, trying removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = TAG_RMDIR;

	/*
	 * Rmdir ops: putfh dir; remove
	 */
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	need_end_op = TRUE;

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr (postop attrs for dir that contained removed dir) */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	/* Timestamp the call so cached attrs can be aged correctly. */
	dinfo.di_time_call = gethrtime();
	doqueue = 1;
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	/* The object's attrs are stale whether or not the remove worked. */
	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(dvp);
	}

	if (needrecov) {
		/* Recovery was started; retry the compound from scratch. */
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
		    NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return error if first 2 ops (OP_REMOVE or earlier)
		 * failed.
		 */
		if (res.status != NFS4_OK && res.array_len <= 2) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
			    &recov_state, needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			/*
			 * System V defines rmdir to return EEXIST, not
			 * ENOTEMPTY if the directory is not empty.  Over
			 * the wire, the error is NFSERR_ENOTEMPTY which
			 * geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			if (res.status == NFS4_OK) {
				resop = &res.array[2];	/* dir attrs */
				dinfo.di_garp =
				    &resop->nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				/* NULL dinfop tells the cache code that
				 * no fresh post-op attrs are available. */
				dinfop = NULL;

			/* Update dir attribute, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    dinfop);

			/* destroy rddir cache for dir that was removed */
			if (VTOR4(vp)->r_dir != NULL)
				nfs4_purge_rddir_cache(vp);
		}
	}

	if (need_end_op)
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	nfs_rw_exit(&drp->r_rwlock);

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Fire the rmdir vnevent on the "real" vnode (not a
		 * shadow), for file event monitors.
		 */
		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_rmdir(tvp, dvp, nm, ct);
	}

	VN_RELE(vp);

	return (e.error);
}
8828 8836
8829 8837 /* ARGSUSED */
8830 8838 static int
8831 8839 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8832 8840 caller_context_t *ct, int flags)
8833 8841 {
8834 8842 int error;
8835 8843 vnode_t *vp;
8836 8844 rnode4_t *rp;
8837 8845 char *contents;
8838 8846 mntinfo4_t *mi = VTOMI4(dvp);
8839 8847
8840 8848 if (nfs_zone() != mi->mi_zone)
8841 8849 return (EPERM);
8842 8850 if (!(mi->mi_flags & MI4_SYMLINK))
8843 8851 return (EOPNOTSUPP);
8844 8852
8845 8853 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8846 8854 if (error)
8847 8855 return (error);
8848 8856
8849 8857 ASSERT(nfs4_consistent_type(vp));
8850 8858 rp = VTOR4(vp);
8851 8859 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8852 8860
8853 8861 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8854 8862
8855 8863 if (contents != NULL) {
8856 8864 mutex_enter(&rp->r_statelock);
8857 8865 if (rp->r_symlink.contents == NULL) {
8858 8866 rp->r_symlink.len = strlen(tnm);
8859 8867 bcopy(tnm, contents, rp->r_symlink.len);
8860 8868 rp->r_symlink.contents = contents;
8861 8869 rp->r_symlink.size = MAXPATHLEN;
8862 8870 mutex_exit(&rp->r_statelock);
8863 8871 } else {
8864 8872 mutex_exit(&rp->r_statelock);
8865 8873 kmem_free((void *)contents, MAXPATHLEN);
8866 8874 }
8867 8875 }
8868 8876 }
8869 8877 VN_RELE(vp);
8870 8878
8871 8879 return (error);
8872 8880 }
8873 8881
8874 8882
8875 8883 /*
8876 8884 * Read directory entries.
8877 8885 * There are some weird things to look out for here. The uio_loffset
8878 8886 * field is either 0 or it is the offset returned from a previous
8879 8887 * readdir. It is an opaque value used by the server to find the
8880 8888 * correct directory block to read. The count field is the number
8881 8889 * of blocks to read on the server. This is advisory only, the server
8882 8890 * may return only one block's worth of entries. Entries may be compressed
8883 8891 * on the server.
8884 8892 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 * Note: r_statelock is held across the lookup and, unless
	 * an OTW readdir is needed, through the rest of the function.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		/* Drop the lock across the OTW readdir. */
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		/* Flag the entry to be refilled on the next request. */
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead.  Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.  Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead.  In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}
9079 9087
9080 9088 static int
9081 9089 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9082 9090 {
9083 9091 int error;
9084 9092 rnode4_t *rp;
9085 9093
9086 9094 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9087 9095
9088 9096 rp = VTOR4(vp);
9089 9097
9090 9098 /*
9091 9099 * Obtain the readdir results for the caller.
9092 9100 */
9093 9101 nfs4readdir(vp, rdc, cr);
9094 9102
9095 9103 mutex_enter(&rp->r_statelock);
9096 9104 /*
9097 9105 * The entry is now complete
9098 9106 */
9099 9107 rdc->flags &= ~RDDIR;
9100 9108
9101 9109 error = rdc->error;
9102 9110 if (error)
9103 9111 rdc->flags |= RDDIRREQ;
9104 9112 rddir4_cache_rele(rp, rdc);
9105 9113 mutex_exit(&rp->r_statelock);
9106 9114
9107 9115 return (error);
9108 9116 }
9109 9117
9110 9118 /*
9111 9119 * Read directory entries.
9112 9120 * There are some weird things to look out for here. The uio_loffset
9113 9121 * field is either 0 or it is the offset returned from a previous
9114 9122 * readdir. It is an opaque value used by the server to find the
9115 9123 * correct directory block to read. The count field is the number
9116 9124 * of blocks to read on the server. This is advisory only, the server
9117 9125 * may return only one block's worth of entries. Entries may be compressed
9118 9126 * on the server.
9119 9127 *
9120 9128 * Generates the following compound request:
9121 9129 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9122 9130 * must include a Lookupp as well. In this case, send:
9123 9131 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9124 9132 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9125 9133 *
9126 9134 * Get complete attributes and filehandles for entries if this is the
9127 9135 * first read of the directory. Otherwise, just get fileid's.
9128 9136 */
9129 9137 static void
9130 9138 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9131 9139 {
9132 9140 COMPOUND4args_clnt args;
9133 9141 COMPOUND4res_clnt res;
9134 9142 READDIR4args *rargs;
9135 9143 READDIR4res_clnt *rd_res;
9136 9144 bitmap4 rd_bitsval;
9137 9145 nfs_argop4 argop[5];
9138 9146 nfs_resop4 *resop;
9139 9147 rnode4_t *rp = VTOR4(vp);
9140 9148 mntinfo4_t *mi = VTOMI4(vp);
9141 9149 int doqueue;
9142 9150 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
9143 9151 vnode_t *dvp;
9144 9152 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9145 9153 int num_ops, res_opcnt;
9146 9154 bool_t needrecov = FALSE;
9147 9155 nfs4_recov_state_t recov_state;
9148 9156 hrtime_t t;
9149 9157 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9150 9158
9151 9159 ASSERT(nfs_zone() == mi->mi_zone);
9152 9160 ASSERT(rdc->flags & RDDIR);
9153 9161 ASSERT(rdc->entries == NULL);
9154 9162
9155 9163 /*
9156 9164 * If rp were a stub, it should have triggered and caused
9157 9165 * a mount for us to get this far.
9158 9166 */
9159 9167 ASSERT(!RP_ISSTUB(rp));
9160 9168
9161 9169 num_ops = 2;
9162 9170 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9163 9171 /*
9164 9172 * Since nfsv4 readdir may not return entries for "." and "..",
9165 9173 * the client must recreate them:
9166 9174 * To find the correct nodeid, do the following:
9167 9175 * For current node, get nodeid from dnlc.
9168 9176 * - if current node is rootvp, set pnodeid to nodeid.
9169 9177 * - else if parent is in the dnlc, get its nodeid from there.
9170 9178 * - else add LOOKUPP+GETATTR to compound.
9171 9179 */
9172 9180 nodeid = rp->r_attr.va_nodeid;
9173 9181 if (vp->v_flag & VROOT) {
9174 9182 pnodeid = nodeid; /* root of mount point */
9175 9183 } else {
9176 9184 dvp = dnlc_lookup(vp, "..");
9177 9185 if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9178 9186 /* parent in dnlc cache - no need for otw */
9179 9187 pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9180 9188 } else {
9181 9189 /*
9182 9190 * parent not in dnlc cache,
9183 9191 * do lookupp to get its id
9184 9192 */
9185 9193 num_ops = 5;
9186 9194 pnodeid = 0; /* set later by getattr parent */
9187 9195 }
9188 9196 if (dvp)
9189 9197 VN_RELE(dvp);
9190 9198 }
9191 9199 }
9192 9200 recov_state.rs_flags = 0;
9193 9201 recov_state.rs_num_retry_despite_err = 0;
9194 9202
9195 9203 /* Save the original mount point security flavor */
9196 9204 (void) save_mnt_secinfo(mi->mi_curr_serv);
9197 9205
9198 9206 recov_retry:
9199 9207 args.ctag = TAG_READDIR;
9200 9208
9201 9209 args.array = argop;
9202 9210 args.array_len = num_ops;
9203 9211
9204 9212 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9205 9213 &recov_state, NULL)) {
9206 9214 /*
9207 9215 * If readdir a node that is a stub for a crossed mount point,
9208 9216 * keep the original secinfo flavor for the current file
9209 9217 * system, not the crossed one.
9210 9218 */
9211 9219 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9212 9220 rdc->error = e.error;
9213 9221 return;
9214 9222 }
9215 9223
9216 9224 /*
9217 9225 * Determine which attrs to request for dirents. This code
9218 9226 * must be protected by nfs4_start/end_fop because of r_server
9219 9227 * (which will change during failover recovery).
9220 9228 *
9221 9229 */
9222 9230 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9223 9231 /*
9224 9232 * Get all vattr attrs plus filehandle and rdattr_error
9225 9233 */
9226 9234 rd_bitsval = NFS4_VATTR_MASK |
9227 9235 FATTR4_RDATTR_ERROR_MASK |
9228 9236 FATTR4_FILEHANDLE_MASK;
9229 9237
9230 9238 if (rp->r_flags & R4READDIRWATTR) {
9231 9239 mutex_enter(&rp->r_statelock);
9232 9240 rp->r_flags &= ~R4READDIRWATTR;
9233 9241 mutex_exit(&rp->r_statelock);
9234 9242 }
9235 9243 } else {
9236 9244 servinfo4_t *svp = rp->r_server;
9237 9245
9238 9246 /*
9239 9247 * Already read directory. Use readdir with
9240 9248 * no attrs (except for mounted_on_fileid) for updates.
9241 9249 */
9242 9250 rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9243 9251
9244 9252 /*
9245 9253 * request mounted on fileid if supported, else request
9246 9254 * fileid. maybe we should verify that fileid is supported
9247 9255 * and request something else if not.
9248 9256 */
9249 9257 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9250 9258 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9251 9259 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9252 9260 nfs_rw_exit(&svp->sv_lock);
9253 9261 }
9254 9262
9255 9263 /* putfh directory fh */
9256 9264 argop[0].argop = OP_CPUTFH;
9257 9265 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9258 9266
9259 9267 argop[1].argop = OP_READDIR;
9260 9268 rargs = &argop[1].nfs_argop4_u.opreaddir;
9261 9269 /*
9262 9270 * 1 and 2 are reserved for client "." and ".." entry offset.
9263 9271 * cookie 0 should be used over-the-wire to start reading at
9264 9272 * the beginning of the directory excluding "." and "..".
9265 9273 */
9266 9274 if (rdc->nfs4_cookie == 0 ||
9267 9275 rdc->nfs4_cookie == 1 ||
9268 9276 rdc->nfs4_cookie == 2) {
9269 9277 rargs->cookie = (nfs_cookie4)0;
9270 9278 rargs->cookieverf = 0;
9271 9279 } else {
9272 9280 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9273 9281 mutex_enter(&rp->r_statelock);
9274 9282 rargs->cookieverf = rp->r_cookieverf4;
9275 9283 mutex_exit(&rp->r_statelock);
9276 9284 }
9277 9285 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9278 9286 rargs->maxcount = mi->mi_tsize;
9279 9287 rargs->attr_request = rd_bitsval;
9280 9288 rargs->rdc = rdc;
9281 9289 rargs->dvp = vp;
9282 9290 rargs->mi = mi;
9283 9291 rargs->cr = cr;
9284 9292
9285 9293
9286 9294 /*
9287 9295 * If count < than the minimum required, we return no entries
9288 9296 * and fail with EINVAL
9289 9297 */
9290 9298 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9291 9299 rdc->error = EINVAL;
9292 9300 goto out;
9293 9301 }
9294 9302
9295 9303 if (args.array_len == 5) {
9296 9304 /*
9297 9305 * Add lookupp and getattr for parent nodeid.
9298 9306 */
9299 9307 argop[2].argop = OP_LOOKUPP;
9300 9308
9301 9309 argop[3].argop = OP_GETFH;
9302 9310
9303 9311 /* getattr parent */
9304 9312 argop[4].argop = OP_GETATTR;
9305 9313 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9306 9314 argop[4].nfs_argop4_u.opgetattr.mi = mi;
9307 9315 }
9308 9316
9309 9317 doqueue = 1;
9310 9318
9311 9319 if (mi->mi_io_kstats) {
9312 9320 mutex_enter(&mi->mi_lock);
9313 9321 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9314 9322 mutex_exit(&mi->mi_lock);
9315 9323 }
9316 9324
9317 9325 /* capture the time of this call */
9318 9326 rargs->t = t = gethrtime();
9319 9327
9320 9328 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9321 9329
9322 9330 if (mi->mi_io_kstats) {
9323 9331 mutex_enter(&mi->mi_lock);
9324 9332 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9325 9333 mutex_exit(&mi->mi_lock);
9326 9334 }
9327 9335
9328 9336 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9329 9337
9330 9338 /*
9331 9339 * If RPC error occurred and it isn't an error that
9332 9340 * triggers recovery, then go ahead and fail now.
9333 9341 */
9334 9342 if (e.error != 0 && !needrecov) {
9335 9343 rdc->error = e.error;
9336 9344 goto out;
9337 9345 }
9338 9346
9339 9347 if (needrecov) {
9340 9348 bool_t abort;
9341 9349
9342 9350 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9343 9351 "nfs4readdir: initiating recovery.\n"));
9344 9352
9345 9353 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9346 9354 NULL, OP_READDIR, NULL, NULL, NULL);
9347 9355 if (abort == FALSE) {
9348 9356 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9349 9357 &recov_state, needrecov);
9350 9358 if (!e.error)
9351 9359 (void) xdr_free(xdr_COMPOUND4res_clnt,
9352 9360 (caddr_t)&res);
9353 9361 if (rdc->entries != NULL) {
9354 9362 kmem_free(rdc->entries, rdc->entlen);
9355 9363 rdc->entries = NULL;
9356 9364 }
9357 9365 goto recov_retry;
9358 9366 }
9359 9367
9360 9368 if (e.error != 0) {
9361 9369 rdc->error = e.error;
9362 9370 goto out;
9363 9371 }
9364 9372
9365 9373 /* fall through for res.status case */
9366 9374 }
9367 9375
9368 9376 res_opcnt = res.array_len;
9369 9377
9370 9378 /*
9371 9379 * If compound failed first 2 ops (PUTFH+READDIR), then return
9372 9380 * failure here. Subsequent ops are for filling out dot-dot
9373 9381 * dirent, and if they fail, we still want to give the caller
9374 9382 * the dirents returned by (the successful) READDIR op, so we need
9375 9383 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9376 9384 *
9377 9385 * One example where PUTFH+READDIR ops would succeed but
9378 9386 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9379 9387 * but lacks x. In this case, a POSIX server's VOP_READDIR
9380 9388 * would succeed; however, VOP_LOOKUP(..) would fail since no
9381 9389 * x perm. We need to come up with a non-vendor-specific way
9382 9390 * for a POSIX server to return d_ino from dotdot's dirent if
9383 9391 * client only requests mounted_on_fileid, and just say the
9384 9392 * LOOKUPP succeeded and fill out the GETATTR. However, if
9385 9393 * client requested any mandatory attrs, server would be required
9386 9394 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9387 9395 * for dotdot.
9388 9396 */
9389 9397
9390 9398 if (res.status) {
9391 9399 if (res_opcnt <= 2) {
9392 9400 e.error = geterrno4(res.status);
9393 9401 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9394 9402 &recov_state, needrecov);
9395 9403 nfs4_purge_stale_fh(e.error, vp, cr);
9396 9404 rdc->error = e.error;
9397 9405 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9398 9406 if (rdc->entries != NULL) {
9399 9407 kmem_free(rdc->entries, rdc->entlen);
9400 9408 rdc->entries = NULL;
9401 9409 }
9402 9410 /*
9403 9411 * If readdir a node that is a stub for a
9404 9412 * crossed mount point, keep the original
9405 9413 * secinfo flavor for the current file system,
9406 9414 * not the crossed one.
9407 9415 */
9408 9416 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9409 9417 return;
9410 9418 }
9411 9419 }
9412 9420
9413 9421 resop = &res.array[1]; /* readdir res */
9414 9422 rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9415 9423
9416 9424 mutex_enter(&rp->r_statelock);
9417 9425 rp->r_cookieverf4 = rd_res->cookieverf;
9418 9426 mutex_exit(&rp->r_statelock);
9419 9427
9420 9428 /*
9421 9429 * For "." and ".." entries
9422 9430 * e.g.
9423 9431 * seek(cookie=0) -> "." entry with d_off = 1
9424 9432 * seek(cookie=1) -> ".." entry with d_off = 2
9425 9433 */
9426 9434 if (cookie == (nfs_cookie4) 0) {
9427 9435 if (rd_res->dotp)
9428 9436 rd_res->dotp->d_ino = nodeid;
9429 9437 if (rd_res->dotdotp)
9430 9438 rd_res->dotdotp->d_ino = pnodeid;
9431 9439 }
9432 9440 if (cookie == (nfs_cookie4) 1) {
9433 9441 if (rd_res->dotdotp)
9434 9442 rd_res->dotdotp->d_ino = pnodeid;
9435 9443 }
9436 9444
9437 9445
9438 9446 	/* LOOKUPP+GETATTR attempted */
9439 9447 if (args.array_len == 5 && rd_res->dotdotp) {
9440 9448 if (res.status == NFS4_OK && res_opcnt == 5) {
9441 9449 nfs_fh4 *fhp;
9442 9450 nfs4_sharedfh_t *sfhp;
9443 9451 vnode_t *pvp;
9444 9452 nfs4_ga_res_t *garp;
9445 9453
9446 9454 resop++; /* lookupp */
9447 9455 resop++; /* getfh */
9448 9456 fhp = &resop->nfs_resop4_u.opgetfh.object;
9449 9457
9450 9458 resop++; /* getattr of parent */
9451 9459
9452 9460 /*
9453 9461 * First, take care of finishing the
9454 9462 * readdir results.
9455 9463 */
9456 9464 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9457 9465 /*
9458 9466 * The d_ino of .. must be the inode number
9459 9467 * of the mounted filesystem.
9460 9468 */
9461 9469 if (garp->n4g_va.va_mask & AT_NODEID)
9462 9470 rd_res->dotdotp->d_ino =
9463 9471 garp->n4g_va.va_nodeid;
9464 9472
9465 9473
9466 9474 /*
9467 9475 * Next, create the ".." dnlc entry
9468 9476 */
9469 9477 sfhp = sfh4_get(fhp, mi);
9470 9478 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9471 9479 dnlc_update(vp, "..", pvp);
9472 9480 VN_RELE(pvp);
9473 9481 }
9474 9482 sfh4_rele(&sfhp);
9475 9483 }
9476 9484 }
9477 9485
9478 9486 if (mi->mi_io_kstats) {
9479 9487 mutex_enter(&mi->mi_lock);
9480 9488 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9481 9489 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9482 9490 mutex_exit(&mi->mi_lock);
9483 9491 }
9484 9492
9485 9493 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9486 9494
9487 9495 out:
9488 9496 /*
9489 9497 * If readdir a node that is a stub for a crossed mount point,
9490 9498 * keep the original secinfo flavor for the current file system,
9491 9499 * not the crossed one.
9492 9500 */
9493 9501 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9494 9502
9495 9503 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9496 9504 }
9497 9505
9498 9506
9499 9507 static int
9500 9508 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
9501 9509 {
9502 9510 rnode4_t *rp = VTOR4(bp->b_vp);
9503 9511 int count;
9504 9512 int error;
9505 9513 cred_t *cred_otw = NULL;
9506 9514 offset_t offset;
9507 9515 nfs4_open_stream_t *osp = NULL;
9508 9516 bool_t first_time = TRUE; /* first time getting otw cred */
9509 9517 bool_t last_time = FALSE; /* last time getting otw cred */
9510 9518
9511 9519 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);
9512 9520
9513 9521 DTRACE_IO1(start, struct buf *, bp);
9514 9522 offset = ldbtob(bp->b_lblkno);
9515 9523
9516 9524 if (bp->b_flags & B_READ) {
9517 9525 read_again:
9518 9526 /*
9519 9527 * Releases the osp, if it is provided.
9520 9528 * Puts a hold on the cred_otw and the new osp (if found).
9521 9529 */
9522 9530 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9523 9531 &first_time, &last_time);
9524 9532 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
9525 9533 offset, bp->b_bcount, &bp->b_resid, cred_otw,
9526 9534 readahead, NULL);
9527 9535 crfree(cred_otw);
9528 9536 if (!error) {
9529 9537 if (bp->b_resid) {
9530 9538 /*
9531 9539 * Didn't get it all because we hit EOF,
9532 9540 * zero all the memory beyond the EOF.
9533 9541 */
9534 9542 /* bzero(rdaddr + */
9535 9543 bzero(bp->b_un.b_addr +
9536 9544 bp->b_bcount - bp->b_resid, bp->b_resid);
9537 9545 }
9538 9546 mutex_enter(&rp->r_statelock);
9539 9547 if (bp->b_resid == bp->b_bcount &&
9540 9548 offset >= rp->r_size) {
9541 9549 /*
9542 9550 * We didn't read anything at all as we are
9543 9551 * past EOF. Return an error indicator back
9544 9552 * but don't destroy the pages (yet).
9545 9553 */
9546 9554 error = NFS_EOF;
9547 9555 }
9548 9556 mutex_exit(&rp->r_statelock);
9549 9557 } else if (error == EACCES && last_time == FALSE) {
9550 9558 goto read_again;
9551 9559 }
9552 9560 } else {
9553 9561 if (!(rp->r_flags & R4STALE)) {
9554 9562 write_again:
9555 9563 /*
9556 9564 * Releases the osp, if it is provided.
9557 9565 * Puts a hold on the cred_otw and the new
9558 9566 * osp (if found).
9559 9567 */
9560 9568 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9561 9569 &first_time, &last_time);
9562 9570 mutex_enter(&rp->r_statelock);
9563 9571 count = MIN(bp->b_bcount, rp->r_size - offset);
9564 9572 mutex_exit(&rp->r_statelock);
9565 9573 if (count < 0)
9566 9574 cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
9567 9575 #ifdef DEBUG
9568 9576 if (count == 0) {
9569 9577 zoneid_t zoneid = getzoneid();
9570 9578
9571 9579 zcmn_err(zoneid, CE_WARN,
9572 9580 "nfs4_bio: zero length write at %lld",
9573 9581 offset);
9574 9582 zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
9575 9583 "b_bcount=%ld, file size=%lld",
9576 9584 rp->r_flags, (long)bp->b_bcount,
9577 9585 rp->r_size);
9578 9586 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
9579 9587 if (nfs4_bio_do_stop)
9580 9588 debug_enter("nfs4_bio");
9581 9589 }
9582 9590 #endif
9583 9591 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
9584 9592 count, cred_otw, stab_comm);
9585 9593 if (error == EACCES && last_time == FALSE) {
9586 9594 crfree(cred_otw);
9587 9595 goto write_again;
9588 9596 }
9589 9597 bp->b_error = error;
9590 9598 if (error && error != EINTR &&
9591 9599 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
9592 9600 /*
9593 9601 * Don't print EDQUOT errors on the console.
9594 9602 * Don't print asynchronous EACCES errors.
9595 9603 * Don't print EFBIG errors.
9596 9604 * Print all other write errors.
9597 9605 */
9598 9606 if (error != EDQUOT && error != EFBIG &&
9599 9607 (error != EACCES ||
9600 9608 !(bp->b_flags & B_ASYNC)))
9601 9609 nfs4_write_error(bp->b_vp,
9602 9610 error, cred_otw);
9603 9611 /*
9604 9612 * Update r_error and r_flags as appropriate.
9605 9613 * If the error was ESTALE, then mark the
9606 9614 * rnode as not being writeable and save
9607 9615 * the error status. Otherwise, save any
9608 9616 * errors which occur from asynchronous
9609 9617 * page invalidations. Any errors occurring
9610 9618 * from other operations should be saved
9611 9619 * by the caller.
9612 9620 */
9613 9621 mutex_enter(&rp->r_statelock);
9614 9622 if (error == ESTALE) {
9615 9623 rp->r_flags |= R4STALE;
9616 9624 if (!rp->r_error)
9617 9625 rp->r_error = error;
9618 9626 } else if (!rp->r_error &&
9619 9627 (bp->b_flags &
9620 9628 (B_INVAL|B_FORCE|B_ASYNC)) ==
9621 9629 (B_INVAL|B_FORCE|B_ASYNC)) {
9622 9630 rp->r_error = error;
9623 9631 }
9624 9632 mutex_exit(&rp->r_statelock);
9625 9633 }
9626 9634 crfree(cred_otw);
9627 9635 } else {
9628 9636 error = rp->r_error;
9629 9637 /*
9630 9638 * A close may have cleared r_error, if so,
9631 9639 * propagate ESTALE error return properly
9632 9640 */
9633 9641 if (error == 0)
9634 9642 error = ESTALE;
9635 9643 }
9636 9644 }
9637 9645
9638 9646 if (error != 0 && error != NFS_EOF)
9639 9647 bp->b_flags |= B_ERROR;
9640 9648
9641 9649 if (osp)
9642 9650 open_stream_rele(osp, rp);
9643 9651
9644 9652 DTRACE_IO1(done, struct buf *, bp);
9645 9653
9646 9654 return (error);
9647 9655 }
9648 9656
/*
 * VOP_FID for NFSv4: file identifiers are managed by the server, so
 * this client cannot produce a local fid -- report EREMOTE.
 */
9649 9657 /* ARGSUSED */
9650 9658 int
9651 9659 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9652 9660 {
9653 9661 	return (EREMOTE);
9654 9662 }
9655 9663
/*
 * VOP_RWLOCK for NFSv4.  Readers take r_rwlock as RW_READER.  Writers
 * normally take it as RW_WRITER, but when direct I/O is in effect
 * (R4DIRECTIO on the rnode or MI4_DIRECTIO on the mount) and the file
 * has no mmap'ed regions and no cached pages, a reader lock suffices
 * and V_WRITELOCK_FALSE is returned, permitting concurrent writers.
 */
9656 9664 /* ARGSUSED2 */
9657 9665 int
9658 9666 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9659 9667 {
9660 9668 	rnode4_t *rp = VTOR4(vp);
9661 9669 
9662 9670 	if (!write_lock) {
9663 9671 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9664 9672 		return (V_WRITELOCK_FALSE);
9665 9673 	}
9666 9674 
9667 9675 	if ((rp->r_flags & R4DIRECTIO) ||
9668 9676 	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9669 9677 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9670 9678 		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9671 9679 			return (V_WRITELOCK_FALSE);
			/* Pages appeared; drop the reader lock and go
			 * get the writer lock instead. */
9672 9680 		nfs_rw_exit(&rp->r_rwlock);
9673 9681 	}
9674 9682 
9675 9683 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9676 9684 	return (V_WRITELOCK_TRUE);
9677 9685 }
9678 9686
/*
 * VOP_RWUNLOCK for NFSv4: release r_rwlock in whichever mode
 * nfs4_rwlock() acquired it.
 */
9679 9687 /* ARGSUSED */
9680 9688 void
9681 9689 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9682 9690 {
9683 9691 	rnode4_t *rp = VTOR4(vp);
9684 9692 
9685 9693 	nfs_rw_exit(&rp->r_rwlock);
9686 9694 }
9687 9695
/*
 * VOP_SEEK for NFSv4: any offset is permitted on a directory (offsets
 * there are readdir cookies), and negative offsets are rejected with
 * EINVAL on everything else.
 */
9688 9696 /* ARGSUSED */
9689 9697 static int
9690 9698 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9691 9699 {
9692 9700 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9693 9701 		return (EIO);
9694 9702 
9695 9703 	/*
9696 9704 	 * Because we stuff the readdir cookie into the offset field
9697 9705 	 * someone may attempt to do an lseek with the cookie which
9698 9706 	 * we want to succeed.
9699 9707 	 */
9700 9708 	if (vp->v_type == VDIR)
9701 9709 		return (0);
9702 9710 	if (*noffp < 0)
9703 9711 		return (EINVAL);
9704 9712 	return (0);
9705 9713 }
9706 9714
9707 9715
9708 9716 /*
9709 9717  * Return all the pages from [off..off+len) in file
9710 9718  */
/*
 * VOP_GETPAGE for NFSv4: validate the caches, throttle dirty-page
 * creation for S_CREATE faults, reject out-of-range offsets (except
 * for segkmap callers, who may legitimately extend the file), then
 * hand off to nfs4_getapage()/pvn_getpages().  NFS_EOF from the lower
 * layer triggers a cache purge and retry; ESTALE purges the stale
 * file handle.
 */
9711 9719 /* ARGSUSED */
9712 9720 static int
9713 9721 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9714 9722     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9715 9723     enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9716 9724 {
9717 9725 	rnode4_t *rp;
9718 9726 	int error;
9719 9727 	mntinfo4_t *mi;
9720 9728 
9721 9729 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9722 9730 		return (EIO);
9723 9731 	rp = VTOR4(vp);
9724 9732 	if (IS_SHADOW(vp, rp))
9725 9733 		vp = RTOV4(rp);
9726 9734 
9727 9735 	if (vp->v_flag & VNOMAP)
9728 9736 		return (ENOSYS);
9729 9737 
9730 9738 	if (protp != NULL)
9731 9739 		*protp = PROT_ALL;
9732 9740 
9733 9741 	/*
9734 9742 	 * Now validate that the caches are up to date.
9735 9743 	 */
9736 9744 	if (error = nfs4_validate_caches(vp, cr))
9737 9745 		return (error);
9738 9746 
9739 9747 	mi = VTOMI4(vp);
9740 9748 retry:
9741 9749 	mutex_enter(&rp->r_statelock);
9742 9750 
9743 9751 	/*
9744 9752 	 * Don't create dirty pages faster than they
9745 9753 	 * can be cleaned so that the system doesn't
9746 9754 	 * get imbalanced. If the async queue is
9747 9755 	 * maxed out, then wait for it to drain before
9748 9756 	 * creating more dirty pages. Also, wait for
9749 9757 	 * any threads doing pagewalks in the vop_getattr
9750 9758 	 * entry points so that they don't block for
9751 9759 	 * long periods.
9752 9760 	 */
9753 9761 	if (rw == S_CREATE) {
9754 9762 		while ((mi->mi_max_threads != 0 &&
9755 9763 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
9756 9764 		    rp->r_gcount > 0)
9757 9765 			cv_wait(&rp->r_cv, &rp->r_statelock);
9758 9766 	}
9759 9767 
9760 9768 	/*
9761 9769 	 * If we are getting called as a side effect of an nfs_write()
9762 9770 	 * operation the local file size might not be extended yet.
9763 9771 	 * In this case we want to be able to return pages of zeroes.
9764 9772 	 */
9765 9773 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9766 9774 		NFS4_DEBUG(nfs4_pageio_debug,
9767 9775 		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
9768 9776 		    "len=%llu, size=%llu, attrsize =%llu", off,
9769 9777 		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9770 9778 		mutex_exit(&rp->r_statelock);
9771 9779 		return (EFAULT);	/* beyond EOF */
9772 9780 	}
9773 9781 
9774 9782 	mutex_exit(&rp->r_statelock);
9775 9783 
9776 9784 	if (len <= PAGESIZE) {
9777 9785 		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
9778 9786 		    seg, addr, rw, cr);
9779 9787 		NFS4_DEBUG(nfs4_pageio_debug && error,
9780 9788 		    (CE_NOTE, "getpage error %d; off=%lld, "
9781 9789 		    "len=%lld", error, off, (u_longlong_t)len));
9782 9790 	} else {
9783 9791 		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9784 9792 		    pl, plsz, seg, addr, rw, cr);
9785 9793 		NFS4_DEBUG(nfs4_pageio_debug && error,
9786 9794 		    (CE_NOTE, "getpages error %d; off=%lld, "
9787 9795 		    "len=%lld", error, off, (u_longlong_t)len));
9788 9796 	}
9789 9797 
9790 9798 	switch (error) {
9791 9799 	case NFS_EOF:
9792 9800 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9793 9801 		goto retry;
9794 9802 	case ESTALE:
9795 9803 		nfs4_purge_stale_fh(error, vp, cr);
9796 9804 	}
9797 9805 
9798 9806 	return (error);
9799 9807 }
9800 9808
9801 9809 /*
9802 9810  * Called from pvn_getpages or nfs4_getpage to get a particular page.
9803 9811  */
/*
 * Obtain one page at `off': schedule sequential readahead first (so
 * async threads run in parallel with the synchronous read), then
 * either find the page in the cache, create a zero page for S_CREATE,
 * or kluster and read a block from the server via nfs4_bio().
 * Returns 0 with pl[] filled in, or an errno; NFS_EOF from nfs4_bio()
 * becomes 0 (zeroed pages) for segkmap callers and EFAULT otherwise.
 */
9804 9812 /* ARGSUSED */
9805 9813 static int
9806 9814 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9807 9815     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9808 9816     enum seg_rw rw, cred_t *cr)
9809 9817 {
9810 9818 	rnode4_t *rp;
9811 9819 	uint_t bsize;
9812 9820 	struct buf *bp;
9813 9821 	page_t *pp;
9814 9822 	u_offset_t lbn;
9815 9823 	u_offset_t io_off;
9816 9824 	u_offset_t blkoff;
9817 9825 	u_offset_t rablkoff;
9818 9826 	size_t io_len;
9819 9827 	uint_t blksize;
9820 9828 	int error;
9821 9829 	int readahead;
9822 9830 	int readahead_issued = 0;
9823 9831 	int ra_window; /* readahead window */
9824 9832 	page_t *pagefound;
9825 9833 	page_t *savepp;
9826 9834 
9827 9835 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9828 9836 		return (EIO);
9829 9837 
9830 9838 	rp = VTOR4(vp);
9831 9839 	ASSERT(!IS_SHADOW(vp, rp));
9832 9840 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9833 9841 
9834 9842 reread:
9835 9843 	bp = NULL;
9836 9844 	pp = NULL;
9837 9845 	pagefound = NULL;
9838 9846 
9839 9847 	if (pl != NULL)
9840 9848 		pl[0] = NULL;
9841 9849 
9842 9850 	error = 0;
9843 9851 	lbn = off / bsize;
9844 9852 	blkoff = lbn * bsize;
9845 9853 
9846 9854 	/*
9847 9855 	 * Queueing up the readahead before doing the synchronous read
9848 9856 	 * results in a significant increase in read throughput because
9849 9857 	 * of the increased parallelism between the async threads and
9850 9858 	 * the process context.
9851 9859 	 */
9852 9860 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9853 9861 	    rw != S_CREATE &&
9854 9862 	    !(vp->v_flag & VNOCACHE)) {
9855 9863 		mutex_enter(&rp->r_statelock);
9856 9864 
9857 9865 		/*
9858 9866 		 * Calculate the number of readaheads to do.
9859 9867 		 * a) No readaheads at offset = 0.
9860 9868 		 * b) Do maximum(nfs4_nra) readaheads when the readahead
9861 9869 		 *    window is closed.
9862 9870 		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9863 9871 		 *    upon how far the readahead window is open or close.
9864 9872 		 * d) No readaheads if rp->r_nextr is not within the scope
9865 9873 		 *    of the readahead window (random i/o).
9866 9874 		 */
9867 9875 
9868 9876 		if (off == 0)
9869 9877 			readahead = 0;
9870 9878 		else if (blkoff == rp->r_nextr)
9871 9879 			readahead = nfs4_nra;
9872 9880 		else if (rp->r_nextr > blkoff &&
9873 9881 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
9874 9882 		    <= (nfs4_nra - 1)))
9875 9883 			readahead = nfs4_nra - ra_window;
9876 9884 		else
9877 9885 			readahead = 0;
9878 9886 
9879 9887 		rablkoff = rp->r_nextr;
9880 9888 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9881 9889 			mutex_exit(&rp->r_statelock);
9882 9890 			if (nfs4_async_readahead(vp, rablkoff + bsize,
9883 9891 			    addr + (rablkoff + bsize - off),
9884 9892 			    seg, cr, nfs4_readahead) < 0) {
9885 9893 				mutex_enter(&rp->r_statelock);
9886 9894 				break;
9887 9895 			}
9888 9896 			readahead--;
9889 9897 			rablkoff += bsize;
9890 9898 			/*
9891 9899 			 * Indicate that we did a readahead so
9892 9900 			 * readahead offset is not updated
9893 9901 			 * by the synchronous read below.
9894 9902 			 */
9895 9903 			readahead_issued = 1;
9896 9904 			mutex_enter(&rp->r_statelock);
9897 9905 			/*
9898 9906 			 * set readahead offset to
9899 9907 			 * offset of last async readahead
9900 9908 			 * request.
9901 9909 			 */
9902 9910 			rp->r_nextr = rablkoff;
9903 9911 		}
9904 9912 		mutex_exit(&rp->r_statelock);
9905 9913 	}
9906 9914 
9907 9915 again:
9908 9916 	if ((pagefound = page_exists(vp, off)) == NULL) {
9909 9917 		if (pl == NULL) {
9910 9918 			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9911 9919 			    nfs4_readahead);
9912 9920 		} else if (rw == S_CREATE) {
9913 9921 			/*
9914 9922 			 * Block for this page is not allocated, or the offset
9915 9923 			 * is beyond the current allocation size, or we're
9916 9924 			 * allocating a swap slot and the page was not found,
9917 9925 			 * so allocate it and return a zero page.
9918 9926 			 */
9919 9927 			if ((pp = page_create_va(vp, off,
9920 9928 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9921 9929 				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9922 9930 			io_len = PAGESIZE;
9923 9931 			mutex_enter(&rp->r_statelock);
9924 9932 			rp->r_nextr = off + PAGESIZE;
9925 9933 			mutex_exit(&rp->r_statelock);
9926 9934 		} else {
9927 9935 			/*
9928 9936 			 * Need to go to server to get a block
9929 9937 			 */
9930 9938 			mutex_enter(&rp->r_statelock);
9931 9939 			if (blkoff < rp->r_size &&
9932 9940 			    blkoff + bsize > rp->r_size) {
9933 9941 				/*
9934 9942 				 * If less than a block left in
9935 9943 				 * file read less than a block.
9936 9944 				 */
9937 9945 				if (rp->r_size <= off) {
9938 9946 					/*
9939 9947 					 * Trying to access beyond EOF,
9940 9948 					 * set up to get at least one page.
9941 9949 					 */
9942 9950 					blksize = off + PAGESIZE - blkoff;
9943 9951 				} else
9944 9952 					blksize = rp->r_size - blkoff;
9945 9953 			} else if ((off == 0) ||
9946 9954 			    (off != rp->r_nextr && !readahead_issued)) {
9947 9955 				blksize = PAGESIZE;
9948 9956 				blkoff = off; /* block = page here */
9949 9957 			} else
9950 9958 				blksize = bsize;
9951 9959 			mutex_exit(&rp->r_statelock);
9952 9960 
9953 9961 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9954 9962 			    &io_len, blkoff, blksize, 0);
9955 9963 
9956 9964 			/*
9957 9965 			 * Some other thread has entered the page,
9958 9966 			 * so just use it.
9959 9967 			 */
9960 9968 			if (pp == NULL)
9961 9969 				goto again;
9962 9970 
9963 9971 			/*
9964 9972 			 * Now round the request size up to page boundaries.
9965 9973 			 * This ensures that the entire page will be
9966 9974 			 * initialized to zeroes if EOF is encountered.
9967 9975 			 */
9968 9976 			io_len = ptob(btopr(io_len));
9969 9977 
9970 9978 			bp = pageio_setup(pp, io_len, vp, B_READ);
9971 9979 			ASSERT(bp != NULL);
9972 9980 
9973 9981 			/*
9974 9982 			 * pageio_setup should have set b_addr to 0. This
9975 9983 			 * is correct since we want to do I/O on a page
9976 9984 			 * boundary. bp_mapin will use this addr to calculate
9977 9985 			 * an offset, and then set b_addr to the kernel virtual
9978 9986 			 * address it allocated for us.
9979 9987 			 */
9980 9988 			ASSERT(bp->b_un.b_addr == 0);
9981 9989 
9982 9990 			bp->b_edev = 0;
9983 9991 			bp->b_dev = 0;
9984 9992 			bp->b_lblkno = lbtodb(io_off);
9985 9993 			bp->b_file = vp;
9986 9994 			bp->b_offset = (offset_t)off;
9987 9995 			bp_mapin(bp);
9988 9996 
9989 9997 			/*
9990 9998 			 * If doing a write beyond what we believe is EOF,
9991 9999 			 * don't bother trying to read the pages from the
9992 10000 			 * server, we'll just zero the pages here. We
9993 10001 			 * don't check that the rw flag is S_WRITE here
9994 10002 			 * because some implementations may attempt a
9995 10003 			 * read access to the buffer before copying data.
9996 10004 			 */
9997 10005 			mutex_enter(&rp->r_statelock);
9998 10006 			if (io_off >= rp->r_size && seg == segkmap) {
9999 10007 				mutex_exit(&rp->r_statelock);
10000 10008 				bzero(bp->b_un.b_addr, io_len);
10001 10009 			} else {
10002 10010 				mutex_exit(&rp->r_statelock);
10003 10011 				error = nfs4_bio(bp, NULL, cr, FALSE);
10004 10012 			}
10005 10013 
10006 10014 			/*
10007 10015 			 * Unmap the buffer before freeing it.
10008 10016 			 */
10009 10017 			bp_mapout(bp);
10010 10018 			pageio_done(bp);
10011 10019 
10012 10020 			savepp = pp;
10013 10021 			do {
10014 10022 				pp->p_fsdata = C_NOCOMMIT;
10015 10023 			} while ((pp = pp->p_next) != savepp);
10016 10024 
10017 10025 			if (error == NFS_EOF) {
10018 10026 				/*
10019 10027 				 * If doing a write system call just return
10020 10028 				 * zeroed pages, else user tried to get pages
10021 10029 				 * beyond EOF, return error. We don't check
10022 10030 				 * that the rw flag is S_WRITE here because
10023 10031 				 * some implementations may attempt a read
10024 10032 				 * access to the buffer before copying data.
10025 10033 				 */
10026 10034 				if (seg == segkmap)
10027 10035 					error = 0;
10028 10036 				else
10029 10037 					error = EFAULT;
10030 10038 			}
10031 10039 
10032 10040 			if (!readahead_issued && !error) {
10033 10041 				mutex_enter(&rp->r_statelock);
10034 10042 				rp->r_nextr = io_off + io_len;
10035 10043 				mutex_exit(&rp->r_statelock);
10036 10044 			}
10037 10045 		}
10038 10046 	}
10039 10047 
10040 10048 out:
10041 10049 	if (pl == NULL)
10042 10050 		return (error);
10043 10051 
10044 10052 	if (error) {
10045 10053 		if (pp != NULL)
10046 10054 			pvn_read_done(pp, B_ERROR);
10047 10055 		return (error);
10048 10056 	}
10049 10057 
10050 10058 	if (pagefound) {
10051 10059 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10052 10060 
10053 10061 		/*
10054 10062 		 * Page exists in the cache, acquire the appropriate lock.
10055 10063 		 * If this fails, start all over again.
10056 10064 		 */
10057 10065 		if ((pp = page_lookup(vp, off, se)) == NULL) {
10058 10066 #ifdef DEBUG
10059 10067 			nfs4_lostpage++;
10060 10068 #endif
10061 10069 			goto reread;
10062 10070 		}
10063 10071 		pl[0] = pp;
10064 10072 		pl[1] = NULL;
10065 10073 		return (0);
10066 10074 	}
10067 10075 
10068 10076 	if (pp != NULL)
10069 10077 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10070 10078 
10071 10079 	return (error);
10072 10080 }
10073 10081
/*
 * Asynchronous readahead callback (handed to nfs4_async_readahead()
 * from nfs4_getapage()): read one block-sized kluster starting at
 * `blkoff' into the page cache via nfs4_bio().  No status is returned;
 * on error the pages are released with B_ERROR (pvn_read_done()
 * destroys them) and r_nextr is pulled back to io_off so the range can
 * be retried synchronously.
 */
10074 10082 static void
10075 10083 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
10076 10084     cred_t *cr)
10077 10085 {
10078 10086 	int error;
10079 10087 	page_t *pp;
10080 10088 	u_offset_t io_off;
10081 10089 	size_t io_len;
10082 10090 	struct buf *bp;
10083 10091 	uint_t bsize, blksize;
10084 10092 	rnode4_t *rp = VTOR4(vp);
10085 10093 	page_t *savepp;
10086 10094 
10087 10095 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10088 10096 
10089 10097 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10090 10098 
10091 10099 	mutex_enter(&rp->r_statelock);
10092 10100 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
10093 10101 		/*
10094 10102 		 * If less than a block left in file read less
10095 10103 		 * than a block.
10096 10104 		 */
10097 10105 		blksize = rp->r_size - blkoff;
10098 10106 	} else
10099 10107 		blksize = bsize;
10100 10108 	mutex_exit(&rp->r_statelock);
10101 10109 
10102 10110 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
10103 10111 	    &io_off, &io_len, blkoff, blksize, 1);
10104 10112 	/*
10105 10113 	 * The isra flag passed to the kluster function is 1, we may have
10106 10114 	 * gotten a return value of NULL for a variety of reasons (# of free
10107 10115 	 * pages < minfree, someone entered the page on the vnode etc). In all
10108 10116 	 * cases, we want to punt on the readahead.
10109 10117 	 */
10110 10118 	if (pp == NULL)
10111 10119 		return;
10112 10120 
10113 10121 	/*
10114 10122 	 * Now round the request size up to page boundaries.
10115 10123 	 * This ensures that the entire page will be
10116 10124 	 * initialized to zeroes if EOF is encountered.
10117 10125 	 */
10118 10126 	io_len = ptob(btopr(io_len));
10119 10127 
10120 10128 	bp = pageio_setup(pp, io_len, vp, B_READ);
10121 10129 	ASSERT(bp != NULL);
10122 10130 
10123 10131 	/*
10124 10132 	 * pageio_setup should have set b_addr to 0. This is correct since
10125 10133 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
10126 10134 	 * to calculate an offset, and then set b_addr to the kernel virtual
10127 10135 	 * address it allocated for us.
10128 10136 	 */
10129 10137 	ASSERT(bp->b_un.b_addr == 0);
10130 10138 
10131 10139 	bp->b_edev = 0;
10132 10140 	bp->b_dev = 0;
10133 10141 	bp->b_lblkno = lbtodb(io_off);
10134 10142 	bp->b_file = vp;
10135 10143 	bp->b_offset = (offset_t)blkoff;
10136 10144 	bp_mapin(bp);
10137 10145 
10138 10146 	/*
10139 10147 	 * If doing a write beyond what we believe is EOF, don't bother trying
10140 10148 	 * to read the pages from the server, we'll just zero the pages here.
10141 10149 	 * We don't check that the rw flag is S_WRITE here because some
10142 10150 	 * implementations may attempt a read access to the buffer before
10143 10151 	 * copying data.
10144 10152 	 */
10145 10153 	mutex_enter(&rp->r_statelock);
10146 10154 	if (io_off >= rp->r_size && seg == segkmap) {
10147 10155 		mutex_exit(&rp->r_statelock);
10148 10156 		bzero(bp->b_un.b_addr, io_len);
10149 10157 		error = 0;
10150 10158 	} else {
10151 10159 		mutex_exit(&rp->r_statelock);
10152 10160 		error = nfs4_bio(bp, NULL, cr, TRUE);
10153 10161 		if (error == NFS_EOF)
10154 10162 			error = 0;
10155 10163 	}
10156 10164 
10157 10165 	/*
10158 10166 	 * Unmap the buffer before freeing it.
10159 10167 	 */
10160 10168 	bp_mapout(bp);
10161 10169 	pageio_done(bp);
10162 10170 
10163 10171 	savepp = pp;
10164 10172 	do {
10165 10173 		pp->p_fsdata = C_NOCOMMIT;
10166 10174 	} while ((pp = pp->p_next) != savepp);
10167 10175 
10168 10176 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
10169 10177 
10170 10178 	/*
10171 10179 	 * In case of error set readahead offset
10172 10180 	 * to the lowest offset.
10173 10181 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
10174 10182 	 */
10175 10183 	if (error && rp->r_nextr > io_off) {
10176 10184 		mutex_enter(&rp->r_statelock);
		/* Re-check under r_statelock: the unlocked test above
		 * was only an optimization. */
10177 10185 		if (rp->r_nextr > io_off)
10178 10186 			rp->r_nextr = io_off;
10179 10187 		mutex_exit(&rp->r_statelock);
10180 10188 	}
10181 10189 }
10182 10190
10183 10191 /*
10184 10192  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10185 10193  * If len == 0, do from off to EOF.
10186 10194  *
10187 10195  * The normal cases should be len == 0 && off == 0 (entire vp list) or
10188 10196  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10189 10197  * (from pageout).
10190 10198  */
/*
 * VOP_PUTPAGE for NFSv4: a thin wrapper around nfs4_putpages() that
 * bumps r_count around the call (and wakes r_cv waiters afterwards) so
 * other threads can tell a flush is in progress.  Whole-file
 * non-invalidating flushes on a read-only vfs are a no-op.
 */
10191 10199 /* ARGSUSED */
10192 10200 static int
10193 10201 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10194 10202     caller_context_t *ct)
10195 10203 {
10196 10204 	int error;
10197 10205 	rnode4_t *rp;
10198 10206 
10199 10207 	ASSERT(cr != NULL);
10200 10208 
10201 10209 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10202 10210 		return (EIO);
10203 10211 
10204 10212 	rp = VTOR4(vp);
10205 10213 	if (IS_SHADOW(vp, rp))
10206 10214 		vp = RTOV4(rp);
10207 10215 
10208 10216 	/*
10209 10217 	 * XXX - Why should this check be made here?
10210 10218 	 */
10211 10219 	if (vp->v_flag & VNOMAP)
10212 10220 		return (ENOSYS);
10213 10221 
10214 10222 	if (len == 0 && !(flags & B_INVAL) &&
10215 10223 	    (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10216 10224 		return (0);
10217 10225 
10218 10226 	mutex_enter(&rp->r_statelock);
10219 10227 	rp->r_count++;
10220 10228 	mutex_exit(&rp->r_statelock);
10221 10229 	error = nfs4_putpages(vp, off, len, flags, cr);
10222 10230 	mutex_enter(&rp->r_statelock);
10223 10231 	rp->r_count--;
10224 10232 	cv_broadcast(&rp->r_cv);
10225 10233 	mutex_exit(&rp->r_statelock);
10226 10234 
10227 10235 	return (error);
10228 10236 }
10229 10237
/*
 * Write out a single page, possibly klustering adjacent dirty pages.
 *
 *	vp	- vnode of the file (must not be a shadow vnode)
 *	pp	- the page to write back
 *	offp	- if non-NULL, set to the offset actually written
 *	lenp	- if non-NULL, set to the length actually written
 *	flags	- B_* flags; B_ASYNC selects the async write path
 *	cr	- credentials for the write
 *
 * Returns 0 or an errno.  Note that 0 is also returned on the
 * R4MODINPROGRESS collision path below, where the pages are re-dirtied
 * instead of written; the write is restarted at some later time.
 */
int
nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	u_offset_t io_off;
	u_offset_t lbn_off;
	u_offset_t lbn;
	size_t io_len;
	uint_t bsize;
	int error;
	rnode4_t *rp;

	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);
	/* Callers must hold r_count (see nfs4_putpage). */
	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	ASSERT(rp->r_count > 0);
	ASSERT(!IS_SHADOW(vp, rp));

	/* Logical block (of at least a page) containing this page. */
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks. If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */
	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * pvn_write_kluster shouldn't have returned a page with offset
	 * behind the original page we were given. Verify that.
	 */
	ASSERT((pp->p_offset / bsize) >= lbn);

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back. It will also handle invalidation and freeing
	 * of pages that are not dirty. Check for page length rounding
	 * problems.
	 */
	if (io_off + io_len > lbn_off + bsize) {
		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
		io_len = lbn_off + bsize - io_off;
	}
	/*
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
	 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file. When the uiomove() completes the r_size is
	 * updated and the R4MODINPROGRESS flag is cleared.
	 *
	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
	 * consistent value of r_size. Without this handshaking, it is
	 * possible that nfs4_bio() picks up the old value of r_size
	 * before the uiomove() in writerp4() completes. This will result
	 * in the write through nfs4_bio() being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd). When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
	 * checked. This will still be the old size. Therefore the page will
	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
	 * the page will be found to be clean and the write will be dropped.
	 */
	if (rp->r_flags & R4MODINPROGRESS) {
		/* Re-check under r_statelock to close the race. */
		mutex_enter(&rp->r_statelock);
		if ((rp->r_flags & R4MODINPROGRESS) &&
		    rp->r_modaddr + MAXBSIZE > io_off &&
		    rp->r_modaddr < io_off + io_len) {
			page_t *plist;
			/*
			 * A write is in progress for this region of the file.
			 * If we did not detect R4MODINPROGRESS here then this
			 * path through nfs_putapage() would eventually go to
			 * nfs4_bio() and may not write out all of the data
			 * in the pages. We end up losing data. So we decide
			 * to set the modified bit on each page in the page
			 * list and mark the rnode with R4DIRTY. This write
			 * will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= R4DIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	if (flags & B_ASYNC) {
		error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_putapage);
	} else
		error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}
10354 10362
/*
 * Synchronously write the pages in [io_off, io_off + io_len) to the server.
 * Used both for synchronous putpage requests and as the worker function
 * handed to nfs4_async_putapage() by nfs4_putapage().
 *
 * On "out of space"-style failures (ENOSPC, EDQUOT, EFBIG, EACCES) without
 * B_INVAL|B_FORCE already set, the rnode is flagged R4OUTOFSPACE, the pages
 * are completed in error, and (for non-async callers only) the write is
 * re-driven with B_INVAL | B_FORCE so the pages are destroyed rather than
 * left to fill memory.  Returns 0 or an errno.
 */
static int
nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	flags |= B_WRITE;

	/* Do the actual over-the-wire write of the page list. */
	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR4(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
	    error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		if (!(rp->r_flags & R4OUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful. This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread. It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them. Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs4_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & R4OUTOFSPACE) {
			/* A write succeeded again; clear the sticky flag. */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
		/* Under memory pressure, nudge the server to commit now. */
		if (freemem < desfree)
			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
			    NFS4_WRITE_NOWAIT);
	}

	return (error);
}
10417 10425
#ifdef DEBUG
/*
 * Debug tunable: when non-zero, nfs4_map() fails with EIO if the file
 * has no open stream, instead of implicitly OPENing it (see nfs4_map()).
 */
int nfs4_force_open_before_mmap = 0;
#endif
10421 10429
10422 10430 /* ARGSUSED */
10423 10431 static int
10424 10432 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10425 10433 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10426 10434 caller_context_t *ct)
10427 10435 {
10428 10436 struct segvn_crargs vn_a;
10429 10437 int error = 0;
10430 10438 rnode4_t *rp = VTOR4(vp);
10431 10439 mntinfo4_t *mi = VTOMI4(vp);
10432 10440
10433 10441 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10434 10442 return (EIO);
10435 10443
10436 10444 if (vp->v_flag & VNOMAP)
10437 10445 return (ENOSYS);
10438 10446
10439 10447 if (off < 0 || (off + len) < 0)
10440 10448 return (ENXIO);
10441 10449
10442 10450 if (vp->v_type != VREG)
10443 10451 return (ENODEV);
10444 10452
10445 10453 /*
10446 10454 * If the file is delegated to the client don't do anything.
10447 10455 * If the file is not delegated, then validate the data cache.
10448 10456 */
10449 10457 mutex_enter(&rp->r_statev4_lock);
10450 10458 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10451 10459 mutex_exit(&rp->r_statev4_lock);
10452 10460 error = nfs4_validate_caches(vp, cr);
10453 10461 if (error)
10454 10462 return (error);
10455 10463 } else {
10456 10464 mutex_exit(&rp->r_statev4_lock);
10457 10465 }
10458 10466
10459 10467 /*
10460 10468 * Check to see if the vnode is currently marked as not cachable.
10461 10469 * This means portions of the file are locked (through VOP_FRLOCK).
10462 10470 * In this case the map request must be refused. We use
10463 10471 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10464 10472 *
10465 10473 * Atomically increment r_inmap after acquiring r_rwlock. The
10466 10474 * idea here is to acquire r_rwlock to block read/write and
10467 10475 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10468 10476 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10469 10477 * and we can prevent the deadlock that would have occurred
10470 10478 * when nfs4_addmap() would have acquired it out of order.
10471 10479 *
10472 10480 * Since we are not protecting r_inmap by any lock, we do not
10473 10481 * hold any lock when we decrement it. We atomically decrement
10474 10482 * r_inmap after we release r_lkserlock.
10475 10483 */
10476 10484
10477 10485 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10478 10486 return (EINTR);
10479 10487 atomic_add_int(&rp->r_inmap, 1);
10480 10488 nfs_rw_exit(&rp->r_rwlock);
10481 10489
10482 10490 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10483 10491 atomic_add_int(&rp->r_inmap, -1);
10484 10492 return (EINTR);
10485 10493 }
10486 10494
10487 10495
10488 10496 if (vp->v_flag & VNOCACHE) {
10489 10497 error = EAGAIN;
10490 10498 goto done;
10491 10499 }
10492 10500
10493 10501 /*
10494 10502 * Don't allow concurrent locks and mapping if mandatory locking is
10495 10503 * enabled.
10496 10504 */
10497 10505 if (flk_has_remote_locks(vp)) {
10498 10506 struct vattr va;
10499 10507 va.va_mask = AT_MODE;
10500 10508 error = nfs4getattr(vp, &va, cr);
10501 10509 if (error != 0)
10502 10510 goto done;
10503 10511 if (MANDLOCK(vp, va.va_mode)) {
10504 10512 error = EAGAIN;
10505 10513 goto done;
10506 10514 }
10507 10515 }
10508 10516
10509 10517 /*
10510 10518 * It is possible that the rnode has a lost lock request that we
10511 10519 * are still trying to recover, and that the request conflicts with
10512 10520 * this map request.
10513 10521 *
10514 10522 * An alternative approach would be for nfs4_safemap() to consider
10515 10523 * queued lock requests when deciding whether to set or clear
10516 10524 * VNOCACHE. This would require the frlock code path to call
10517 10525 * nfs4_safemap() after enqueing a lost request.
10518 10526 */
10519 10527 if (nfs4_map_lost_lock_conflict(vp)) {
10520 10528 error = EAGAIN;
10521 10529 goto done;
10522 10530 }
10523 10531
10524 10532 as_rangelock(as);
10525 10533 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10526 10534 if (error != 0) {
10527 10535 as_rangeunlock(as);
10528 10536 goto done;
10529 10537 }
10530 10538
10531 10539 if (vp->v_type == VREG) {
10532 10540 /*
10533 10541 * We need to retrieve the open stream
10534 10542 */
10535 10543 nfs4_open_stream_t *osp = NULL;
10536 10544 nfs4_open_owner_t *oop = NULL;
10537 10545
10538 10546 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10539 10547 if (oop != NULL) {
10540 10548 /* returns with 'os_sync_lock' held */
10541 10549 osp = find_open_stream(oop, rp);
10542 10550 open_owner_rele(oop);
10543 10551 }
10544 10552 if (osp == NULL) {
10545 10553 #ifdef DEBUG
10546 10554 if (nfs4_force_open_before_mmap) {
10547 10555 error = EIO;
10548 10556 goto done;
10549 10557 }
10550 10558 #endif
10551 10559 /* returns with 'os_sync_lock' held */
10552 10560 error = open_and_get_osp(vp, cr, &osp);
10553 10561 if (osp == NULL) {
10554 10562 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10555 10563 "nfs4_map: we tried to OPEN the file "
10556 10564 "but again no osp, so fail with EIO"));
10557 10565 goto done;
10558 10566 }
10559 10567 }
10560 10568
10561 10569 if (osp->os_failed_reopen) {
10562 10570 mutex_exit(&osp->os_sync_lock);
10563 10571 open_stream_rele(osp, rp);
10564 10572 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10565 10573 "nfs4_map: os_failed_reopen set on "
10566 10574 "osp %p, cr %p, rp %s", (void *)osp,
10567 10575 (void *)cr, rnode4info(rp)));
10568 10576 error = EIO;
10569 10577 goto done;
10570 10578 }
10571 10579 mutex_exit(&osp->os_sync_lock);
10572 10580 open_stream_rele(osp, rp);
10573 10581 }
10574 10582
10575 10583 vn_a.vp = vp;
10576 10584 vn_a.offset = off;
10577 10585 vn_a.type = (flags & MAP_TYPE);
10578 10586 vn_a.prot = (uchar_t)prot;
10579 10587 vn_a.maxprot = (uchar_t)maxprot;
10580 10588 vn_a.flags = (flags & ~MAP_TYPE);
10581 10589 vn_a.cred = cr;
10582 10590 vn_a.amp = NULL;
10583 10591 vn_a.szc = 0;
10584 10592 vn_a.lgrp_mem_policy_flags = 0;
10585 10593
10586 10594 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10587 10595 as_rangeunlock(as);
10588 10596
10589 10597 done:
10590 10598 nfs_rw_exit(&rp->r_lkserlock);
10591 10599 atomic_add_int(&rp->r_inmap, -1);
10592 10600 return (error);
10593 10601 }
10594 10602
/*
 * We're most likely dealing with a kernel module that likes to READ
 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
 * officially OPEN the file to create the necessary client state
 * for bookkeeping of os_mmap_read/write counts.
 *
 * Since VOP_MAP only passes in a pointer to the vnode rather than
 * a double pointer, we can't handle the case where nfs4open_otw()
 * returns a different vnode than the one passed into VOP_MAP (since
 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
 * we return NULL and let nfs4_map() fail. Note: the only case where
 * this should happen is if the file got removed and replaced with the
 * same name on the server (in addition to the fact that we're trying
 * to VOP_MAP without VOP_OPENing the file in the first place).
 *
 * On success returns 0 with *ospp set to the open stream, with its
 * os_sync_lock held.  Note that find_open_stream() may come up empty,
 * in which case 0 is returned with *ospp == NULL -- callers must check
 * *ospp, not just the return value.
 */
static int
open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
{
	rnode4_t *rp, *drp;
	vnode_t *dvp, *open_vp;
	char file_name[MAXNAMELEN];
	int just_created;
	nfs4_open_stream_t *osp;
	nfs4_open_owner_t *oop;
	int error;

	*ospp = NULL;
	open_vp = map_vp;

	/* Find the parent directory so we can do the OPEN by name. */
	rp = VTOR4(open_vp);
	if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
		return (error);
	drp = VTOR4(dvp);

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
		VN_RELE(dvp);
		return (EINTR);
	}

	if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		return (error);
	}

	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, file_name, open_vp);
		/* This is needed so we don't bump the open ref count */
		just_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_created = 0;
	}

	/*
	 * Extra hold so map_vp stays valid across nfs4open_otw(), which
	 * may replace open_vp; balanced by the VN_RELEs below.
	 */
	VN_HOLD(map_vp);

	error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
	    just_created);
	if (error) {
		nfs_rw_exit(&drp->r_rwlock);
		VN_RELE(dvp);
		VN_RELE(map_vp);
		return (error);
	}

	nfs_rw_exit(&drp->r_rwlock);
	VN_RELE(dvp);

	/*
	 * If nfs4open_otw() returned a different vnode then "undo"
	 * the open and return failure to the caller.
	 */
	if (!VN_CMP(open_vp, map_vp)) {
		nfs4_error_t e;

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
		    "open returned a different vnode"));
		/*
		 * If there's an error, ignore it,
		 * and let VOP_INACTIVE handle it.
		 */
		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
		    CLOSE_NORM, 0, 0, 0);
		VN_RELE(map_vp);
		return (EIO);
	}

	VN_RELE(map_vp);

	oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
	if (!oop) {
		nfs4_error_t e;

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
		    "no open owner"));
		/*
		 * If there's an error, ignore it,
		 * and let VOP_INACTIVE handle it.
		 */
		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
		    CLOSE_NORM, 0, 0, 0);
		return (EIO);
	}
	/* returns with 'os_sync_lock' held; may be NULL (see above) */
	osp = find_open_stream(oop, rp);
	open_owner_rele(oop);
	*ospp = osp;
	return (0);
}
10707 10715
/*
 * VOP_ADDMAP: account for a new mapping of [addr, addr + len) by bumping
 * r_mapcnt and the open stream's os_mmap_read/os_mmap_write counters, so
 * that a reopen after a server reboot requests the correct share access.
 *
 * Please be aware that when this function is called, the address space write
 * a_lock is held. Do not put over the wire calls in this function.
 */
/* ARGSUSED */
static int
nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	int error = 0;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * Don't need to update the open stream first, since this
	 * mmap can't add any additional share access that isn't
	 * already contained in the open stream (for the case where we
	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
	 * take into account os_mmap_read[write] counts).
	 */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));

	if (vp->v_type == VREG) {
		/*
		 * We need to retrieve the open stream and update the counts.
		 * If there is no open stream here, something is wrong.
		 */
		nfs4_open_stream_t *osp = NULL;
		nfs4_open_owner_t *oop = NULL;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (oop != NULL) {
			/* returns with 'os_sync_lock' held */
			osp = find_open_stream(oop, rp);
			open_owner_rele(oop);
		}
		if (osp == NULL) {
			NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
			    "nfs4_addmap: we should have an osp"
			    "but we don't, so fail with EIO"));
			error = EIO;
			goto out;
		}

		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
		    " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));

		/*
		 * Update the map count in the open stream.
		 * This is necessary in the case where we
		 * open/mmap/close/, then the server reboots, and we
		 * attempt to reopen. If the mmap doesn't add share
		 * access then we send an invalid reopen with
		 * access = NONE.
		 *
		 * We need to specifically check each PROT_* so a mmap
		 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
		 * read and write access. A simple comparison of prot
		 * to ~PROT_WRITE to determine read access is insufficient
		 * since prot can be |= with PROT_USER, etc.
		 */

		/*
		 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
		 */
		if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write += btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read += btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read += btopr(len);
		/*
		 * Ensure that os_mmap_read gets incremented, even if
		 * maxprot were to look like PROT_NONE.
		 */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read += btopr(len);
		osp->os_mapcnt += btopr(len);
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
	}

out:
	/*
	 * If we got an error, then undo our
	 * incrementing of 'r_mapcnt'.
	 */

	if (error) {
		atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
		ASSERT(rp->r_mapcnt >= 0);
	}
	return (error);
}
10812 10820
10813 10821 /* ARGSUSED */
10814 10822 static int
10815 10823 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10816 10824 {
10817 10825
10818 10826 return (VTOR4(vp1) == VTOR4(vp2));
10819 10827 }
10820 10828
/*
 * VOP_FRLOCK: byte-range locking.  Validates the command, l_type and the
 * lock range, then either hands the request to the local locking code
 * (for MI4_LLOCK mounts and non-regular files) or flushes the page cache
 * and sends the request over the wire via nfs4frlock().
 */
/* ARGSUSED */
static int
nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
    caller_context_t *ct)
{
	int rc;
	u_offset_t start, end;
	rnode4_t *rp;
	int error = 0, intr = INTR4(vp);
	nfs4_error_t e;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		/*
		 * Unlocks are made non-interruptible so we don't bail
		 * out halfway and leave an orphan lock on the server.
		 */
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXEND))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock. However, we can't call
			 * nfs4_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * nfs4_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!nfs4_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR4(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!nfs4_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish. For new
	 * locks, this is so that the process gets the latest bits from the
	 * server. For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked. If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set. But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		/* Wait for in-progress page I/O (r_count) to drain. */
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps us out of /proc stops */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv,
				    &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
	    cr, &e, NULL, NULL);
	rc = e.error;

	if (rc == 0)
		nfs4_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);

	return (rc);
}
10962 10970
10963 10971 /*
10964 10972 * Free storage space associated with the specified vnode. The portion
10965 10973 * to be freed is specified by bfp->l_start and bfp->l_len (already
10966 10974 * normalized to a "whence" of 0).
10967 10975 *
10968 10976 * This is an experimental facility whose continued existence is not
10969 10977 * guaranteed. Currently, we only support the special case
10970 10978 * of l_len == 0, meaning free to end of file.
10971 10979 */
10972 10980 /* ARGSUSED */
10973 10981 static int
10974 10982 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10975 10983 offset_t offset, cred_t *cr, caller_context_t *ct)
10976 10984 {
10977 10985 int error;
10978 10986
10979 10987 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10980 10988 return (EIO);
10981 10989 ASSERT(vp->v_type == VREG);
10982 10990 if (cmd != F_FREESP)
10983 10991 return (EINVAL);
10984 10992
10985 10993 error = convoff(vp, bfp, 0, offset);
10986 10994 if (!error) {
10987 10995 ASSERT(bfp->l_start >= 0);
10988 10996 if (bfp->l_len == 0) {
10989 10997 struct vattr va;
10990 10998
10991 10999 va.va_mask = AT_SIZE;
10992 11000 va.va_size = bfp->l_start;
10993 11001 error = nfs4setattr(vp, &va, 0, cr, NULL);
10994 11002 } else
10995 11003 error = EINVAL;
10996 11004 }
10997 11005
10998 11006 return (error);
10999 11007 }
11000 11008
11001 11009 /* ARGSUSED */
11002 11010 int
11003 11011 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11004 11012 {
11005 11013 rnode4_t *rp;
11006 11014 rp = VTOR4(vp);
11007 11015
11008 11016 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11009 11017 vp = RTOV4(rp);
11010 11018 }
11011 11019 *vpp = vp;
11012 11020 return (0);
11013 11021 }
11014 11022
/*
 * Setup and add an address space callback to do the work of the delmap call.
 * The callback will (and must be) deleted in the actual callback function.
 *
 * This is done in order to take care of the problem that we have with holding
 * the address space's a_lock for a long period of time (e.g. if the NFS server
 * is down). Callbacks will be executed in the address space code while the
 * a_lock is not held. Holding the address space's a_lock causes things such
 * as ps and fork to hang because they are trying to acquire this lock as well.
 */
/* ARGSUSED */
static int
nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int caller_found;
	int error;
	rnode4_t *rp;
	nfs4_delmap_args_t *dmapp;
	nfs4_delmapcall_t *delmap_call;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * A process may not change zones if it has NFS pages mmap'ed
	 * in, so we can't legitimately get here from the wrong zone.
	 */
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * The way that the address space of this process deletes its mapping
	 * of this file is via the following call chains:
	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 *
	 * With the use of address space callbacks we are allowed to drop the
	 * address space lock, a_lock, while executing the NFS operations that
	 * need to go over the wire. Returning EAGAIN to the caller of this
	 * function is what drives the execution of the callback that we add
	 * below. The callback will be executed by the address space code
	 * after dropping the a_lock. When the callback is finished, since
	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
	 * is called again on the same segment to finish the rest of the work
	 * that needs to happen during unmapping.
	 *
	 * This action of calling back into the segment driver causes
	 * nfs4_delmap() to get called again, but since the callback was
	 * already executed at this point, it already did the work and there
	 * is nothing left for us to do.
	 *
	 * To Summarize:
	 * - The first time nfs4_delmap is called by the current thread is when
	 * we add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time in this call chain when nfs4_delmap is called we
	 * will find this caller in the delmap caller list and realize there
	 * is no more work to do thus removing this caller from the list and
	 * returning the error that was set in the callback execution.
	 */
	caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
	if (caller_found) {
		/*
		 * 'error' is from the actual delmap operations. To avoid
		 * hangs, we need to handle the return of EAGAIN differently
		 * since this is what drives the callback execution.
		 * In this case, we don't want to return EAGAIN and do the
		 * callback execution because there are none to execute.
		 */
		if (error == EAGAIN)
			return (0);
		else
			return (error);
	}

	/* current caller was not in the list */
	delmap_call = nfs4_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/*
	 * Package the arguments for the callback.  NOTE(review): dmapp is
	 * not freed here; presumably nfs4_delmap_callback releases it —
	 * confirm in the callback implementation.
	 */
	dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);

	dmapp->vp = vp;
	dmapp->off = off;
	dmapp->addr = addr;
	dmapp->len = len;
	dmapp->prot = prot;
	dmapp->maxprot = maxprot;
	dmapp->flags = flags;
	dmapp->cr = cr;
	dmapp->caller = delmap_call;

	error = as_add_callback(as, nfs4_delmap_callback, dmapp,
	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);

	/* EAGAIN tells the VM layer to run the callback and call us again. */
	return (error ? error : EAGAIN);
}
11117 11125
11118 11126 static nfs4_delmapcall_t *
11119 11127 nfs4_init_delmapcall()
11120 11128 {
11121 11129 nfs4_delmapcall_t *delmap_call;
11122 11130
11123 11131 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11124 11132 delmap_call->call_id = curthread;
11125 11133 delmap_call->error = 0;
11126 11134
11127 11135 return (delmap_call);
11128 11136 }
11129 11137
/*
 * Free a delmap caller record previously allocated by
 * nfs4_init_delmapcall().  The caller must already have removed it from
 * the rnode's r_indelmap list.
 */
static void
nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
{
	kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
}
11135 11143
/*
 * Searches for the current delmap caller (based on curthread) in the list of
 * callers.  If it is found, we remove it and free the delmap caller.
 * Returns:
 *	0 if the caller wasn't found
 *	1 if the caller was found, removed and freed.  *errp will be set
 *	  to what the result of the delmap was.
 *
 * Both the r_indelmap list and the R4DELMAPLIST flag are protected by
 * r_statelock; the lock is dropped before freeing the record so the
 * kmem_free() is not done while holding a mutex.
 */
static int
nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
{
	nfs4_delmapcall_t	*delmap_call;

	/*
	 * If the list doesn't exist yet, we create it and return
	 * that the caller wasn't found.  No list = no callers.
	 */
	mutex_enter(&rp->r_statelock);
	if (!(rp->r_flags & R4DELMAPLIST)) {
		/* The list does not exist */
		list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
		    offsetof(nfs4_delmapcall_t, call_node));
		rp->r_flags |= R4DELMAPLIST;
		mutex_exit(&rp->r_statelock);
		return (0);
	} else {
		/* The list exists so search it */
		for (delmap_call = list_head(&rp->r_indelmap);
		    delmap_call != NULL;
		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
			if (delmap_call->call_id == curthread) {
				/* current caller is in the list */
				*errp = delmap_call->error;
				list_remove(&rp->r_indelmap, delmap_call);
				mutex_exit(&rp->r_statelock);
				nfs4_free_delmapcall(delmap_call);
				return (1);
			}
		}
	}
	mutex_exit(&rp->r_statelock);
	return (0);
}
11179 11187
/*
 * Remove some pages from an mmap'd vnode.  Just update the
 * count of pages.  If doing close-to-open, then flush and
 * commit all of the pages associated with this file.
 * Otherwise, start an asynchronous page flush to write out
 * any dirty pages.  This will also associate a credential
 * with the rnode which can be used to write the pages.
 *
 * This is the address space callback registered by nfs4_delmap().  Any
 * error encountered here is recorded in dmapp->caller->error, where the
 * second invocation of nfs4_delmap() (via
 * nfs4_find_and_delete_delmapcall()) will pick it up.  The callback
 * unregisters itself and frees the argument structure before returning,
 * so dmapp must not be referenced after the as_delete_callback() below.
 */
/* ARGSUSED */
static void
nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
{
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;

	rp = VTOR4(dmapp->vp);
	mi = VTOMI4(dmapp->vp);

	/* Drop the mapped-page count for the region being unmapped. */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush and potential commit if there are
	 * pages, the file system was not mounted readonly, the segment
	 * was mapped shared, and the pages themselves were writeable.
	 */
	if (nfs4_has_pages(dmapp->vp) &&
	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);
		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
		    dmapp->len, dmapp->cr);
		if (!e.error) {
			/*
			 * The flush itself succeeded; report any error
			 * previously latched on the rnode and clear it.
			 */
			mutex_enter(&rp->r_statelock);
			e.error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		e.error = 0;

	/* For directio mounts/files, invalidate the cached pages as well. */
	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr, NULL);

	if (e.error) {
		e.stat = puterrno4(e.error);
		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
		    OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
		/* Latch the error for the second nfs4_delmap() pass. */
		dmapp->caller->error = e.error;
	}

	/* Check to see if we need to close the file */

	if (dmapp->vp->v_type == VREG) {
		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);

		if (e.error != 0 || e.stat != NFS4_OK) {
			/*
			 * Since it is possible that e.error == 0 and
			 * e.stat != NFS4_OK (and vice versa),
			 * we do the proper checking in order to get both
			 * e.error and e.stat reporting the correct info.
			 */
			if (e.stat == NFS4_OK)
				e.stat = puterrno4(e.error);
			if (e.error == 0)
				e.error = geterrno4(e.stat);

			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
			dmapp->caller->error = e.error;
		}
	}

	/* Unregister this callback and release the argument structure. */
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
}
11263 11271
11264 11272
11265 11273 static uint_t
11266 11274 fattr4_maxfilesize_to_bits(uint64_t ll)
11267 11275 {
11268 11276 uint_t l = 1;
11269 11277
11270 11278 if (ll == 0) {
11271 11279 return (0);
11272 11280 }
11273 11281
11274 11282 if (ll & 0xffffffff00000000) {
11275 11283 l += 32; ll >>= 32;
11276 11284 }
11277 11285 if (ll & 0xffff0000) {
11278 11286 l += 16; ll >>= 16;
11279 11287 }
11280 11288 if (ll & 0xff00) {
11281 11289 l += 8; ll >>= 8;
11282 11290 }
11283 11291 if (ll & 0xf0) {
11284 11292 l += 4; ll >>= 4;
11285 11293 }
11286 11294 if (ll & 0xc) {
11287 11295 l += 2; ll >>= 2;
11288 11296 }
11289 11297 if (ll & 0x2) {
11290 11298 l += 1;
11291 11299 }
11292 11300 return (l);
11293 11301 }
11294 11302
/*
 * Determine whether the file has any "real" (generic user) extended
 * attributes, as opposed to only transient Solaris sysattr entries.
 * Looks up the xattr directory for vp and, if found, lets
 * do_xattr_exists_check() inspect its contents; *valp is set by that
 * helper.  Returns 0 on success or an errno value from the lookup/check.
 */
static int
nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
{
	vnode_t *avp = NULL;
	int error;

	if ((error = nfs4lookup_xattr(vp, "", &avp,
	    LOOKUP_XATTR, cr)) == 0)
		error = do_xattr_exists_check(avp, valp, cr);
	/* The lookup may return a vnode even on error paths; release it. */
	if (avp)
		VN_RELE(avp);

	return (error);
}
11309 11317
/*
 * VOP_PATHCONF for NFSv4.
 *
 * _PC_PATH_MAX, _PC_SYMLINK_MAX and _PC_ACL_ENABLED are answered locally.
 * The remaining queries are served from the cached pathconf info in the
 * rnode when the attribute cache is valid; otherwise a PATHCONF
 * attribute request is sent over the wire and the result is cached.
 * _PC_XATTR_EXISTS needs extra care (see comment below) because the mere
 * presence of an xattr directory does not prove real user xattrs exist.
 *
 * Returns 0 with *valp set, or an errno (EIO for a cross-zone call,
 * EINVAL for an unsupported cmd, or an OTW failure).
 */
/* ARGSUSED */
int
nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	hrtime_t t;
	rnode4_t *rp;
	nfs4_ga_res_t gar;
	nfs4_ga_ext_res_t ger;

	gar.n4g_ext_res = &ger;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
		*valp = MAXPATHLEN;
		return (0);
	}
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	rp = VTOR4(vp);
	if (cmd == _PC_XATTR_EXISTS) {
		/*
		 * The existence of the xattr directory is not sufficient
		 * for determining whether generic user attributes exists.
		 * The attribute directory could only be a transient directory
		 * used for Solaris sysattr support.  Do a small readdir
		 * to verify if the only entries are sysattrs or not.
		 *
		 * pc4_xattr_valid can be only be trusted when r_xattr_dir
		 * is NULL.  Once the xadir vp exists, we can create xattrs,
		 * and we don't have any way to update the "base" object's
		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
		 * could help out.
		 */
		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
		    rp->r_xattr_dir == NULL) {
			return (nfs4_have_xattrs(vp, valp, cr));
		}
	} else {  /* OLD CODE */
		/* Serve the query from the cached pathconf info if we can. */
		if (ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_pathconf.pc4_cache_valid) {
				error = 0;
				switch (cmd) {
				case _PC_FILESIZEBITS:
					*valp =
					    rp->r_pathconf.pc4_filesizebits;
					break;
				case _PC_LINK_MAX:
					*valp =
					    rp->r_pathconf.pc4_link_max;
					break;
				case _PC_NAME_MAX:
					*valp =
					    rp->r_pathconf.pc4_name_max;
					break;
				case _PC_CHOWN_RESTRICTED:
					*valp =
					    rp->r_pathconf.pc4_chown_restricted;
					break;
				case _PC_NO_TRUNC:
					*valp =
					    rp->r_pathconf.pc4_no_trunc;
					break;
				default:
					error = EINVAL;
					break;
				}
				mutex_exit(&rp->r_statelock);
#ifdef DEBUG
				nfs4_pathconf_cache_hits++;
#endif
				return (error);
			}
			mutex_exit(&rp->r_statelock);
		}
	}
#ifdef DEBUG
	nfs4_pathconf_cache_misses++;
#endif

	/* Cache miss: fetch the pathconf attributes over the wire. */
	t = gethrtime();

	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);

	if (error) {
		/* Invalidate the cached pathconf state on OTW failure. */
		mutex_enter(&rp->r_statelock);
		rp->r_pathconf.pc4_cache_valid = FALSE;
		rp->r_pathconf.pc4_xattr_valid = FALSE;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* interpret the max filesize */
	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
	    fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);

	/* Store the attributes we just received */
	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);

	switch (cmd) {
	case _PC_FILESIZEBITS:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
		break;
	case _PC_LINK_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
		break;
	case _PC_NAME_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
		break;
	case _PC_CHOWN_RESTRICTED:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
		break;
	case _PC_NO_TRUNC:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
		break;
	case _PC_XATTR_EXISTS:
		if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
			if (error = nfs4_have_xattrs(vp, valp, cr))
				return (error);
		}
		break;
	default:
		return (EINVAL);
	}

	return (0);
}
11443 11451
11444 11452 /*
11445 11453 * Called by async thread to do synchronous pageio. Do the i/o, wait
11446 11454 * for it to complete, and cleanup the page list when done.
11447 11455 */
11448 11456 static int
11449 11457 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11450 11458 int flags, cred_t *cr)
11451 11459 {
11452 11460 int error;
11453 11461
11454 11462 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11455 11463
11456 11464 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11457 11465 if (flags & B_READ)
11458 11466 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11459 11467 else
11460 11468 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11461 11469 return (error);
11462 11470 }
11463 11471
/*
 * VOP_PAGEIO for NFSv4.  Performs i/o on the given page list, either
 * asynchronously (handing off to nfs4_sync_pageio() via the async
 * thread machinery) or synchronously in the calling context.
 * r_count is raised around the operation to record an i/o in progress;
 * waiters on r_cv are woken when it is dropped.
 */
/* ARGSUSED */
static int
nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, caller_context_t *ct)
{
	int error;
	rnode4_t *rp;

	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	if (pp == NULL)
		return (EINVAL);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (flags & B_ASYNC) {
		error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
		    nfs4_sync_pageio);
	} else
		error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	return (error);
}
11494 11502
/*
 * VOP_DISPOSE for NFSv4.  Free or destroy (per fl) the page pp.  Pages
 * that still need an NFS COMMIT are first committed to the server,
 * batched with any other committable pages on the vnode to keep the
 * number of COMMIT RPCs down.  The page arrives SE_EXCL locked with its
 * io lock not held, and is consumed (freed/destroyed/unlocked) on every
 * path out of this function.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of commiting
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.  This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	/*
	 * Holding R4COMMIT gives this thread exclusive use of the
	 * r_commit structure without further locking.
	 */
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * pageout/fsflush threads (and wrong-zone callers) must not
	 * block on an OTW COMMIT; hand the work to an async thread.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are nolonger
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, needs to
	 * be rewritten so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
11701 11709
/*
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *      PUTFH(file), COMMIT
 *
 * Sends a COMMIT for [offset, offset + count) over the wire, retrying
 * through the standard nfs4 recovery machinery as needed and retrying
 * once with a different OTW credential on EACCES.  On success the
 * returned write verifier is compared against r_writeverf; a mismatch
 * means the server lost previously written data, so the cached pages
 * are re-dirtied (nfs4_set_mod()) and NFS_VERF_MISMATCH is returned.
 * Returns 0 on success or an errno.
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		/* Retry once with a fresh OTW credential on EACCES. */
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* Verifier matches: the commit is good. */
			mutex_exit(&rp->r_statelock);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * Verifier changed: the server may have lost uncommitted
		 * data, so mark the cached pages modified for rewrite.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
11860 11868
/*
 * Mark all of the vnode's cached pages as modified so they will be
 * rewritten to the server (used after a write-verifier mismatch).
 */
static void
nfs4_set_mod(vnode_t *vp)
{
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */
	pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
}
11869 11877
/*
 * This function is used to gather a page list of the pages which
 * can be committed on the server.
 *
 * The calling thread must have set R4COMMIT.  This bit is used to
 * serialize access to the commit structure in the rnode.  As long
 * as the thread has set R4COMMIT, then it can manipulate the commit
 * structure without requiring any other locks.
 *
 * When this function is called from nfs4_dispose() the page passed
 * into nfs4_dispose() will be SE_EXCL locked, and so this function
 * will skip it.  This is not a problem since we initially add the
 * page to the r_commit page list.
 *
 * Committable pages are accumulated on r_commit.c_pages, and
 * c_commbase/c_commlen are widened to cover the full offset range of
 * the pages gathered so far.
 */
static void
nfs4_get_commit(vnode_t *vp)
{
	rnode4_t *rp;
	page_t *pp;
	kmutex_t *vphm;

	rp = VTOR4(vp);

	ASSERT(rp->r_flags & R4COMMIT);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Step through all of the pages associated with this vnode
	 * looking for pages which need to be committed.
	 */
	do {
		/* Skip marker pages. */
		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
			continue;

		/*
		 * First short-cut everything (without the page_lock)
		 * and see if this page does not need to be committed
		 * or is modified if so then we'll just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
			continue;

		/*
		 * Attempt to lock the page.  If we can't, then
		 * someone else is messing with it or we have been
		 * called from nfs4_dispose and this is the page that
		 * nfs4_dispose was called with.. anyway just skip it.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;

		/*
		 * Lets check again now that we have the page lock.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		/* this had better not be a free page */
		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.
		 */
		if (rp->r_commit.c_pages == NULL) {
			/* First page: the range is exactly this page. */
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else if (pp->p_offset < rp->r_commit.c_commbase) {
			/* Page precedes the range: extend downward. */
			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
		    <= pp->p_offset) {
			/* Page follows the range: extend upward. */
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	} while ((pp = pp->p_vpnext) != vp->v_pages);

	mutex_exit(vphm);
}
11972 11980
11973 11981 /*
11974 11982 * This routine is used to gather together a page list of the pages
11975 11983 * which are to be committed on the server. This routine must not
11976 11984 * be called if the calling thread holds any locked pages.
11977 11985 *
11978 11986 * The calling thread must have set R4COMMIT. This bit is used to
11979 11987 * serialize access to the commit structure in the rnode. As long
11980 11988 * as the thread has set R4COMMIT, then it can manipulate the commit
11981 11989 * structure without requiring any other locks.
11982 11990 */
static void
nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{

	rnode4_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;
	ASSERT(len != 0);
	rp = VTOR4(vp);
	/* Caller must hold the commit serialization bit (see block comment) */
	ASSERT(rp->r_flags & R4COMMIT);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;
	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;
	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.  The nowait lookup
		 * simply skips pages someone else has locked.
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;
		/*
		 * If this page does not need to be committed or is
		 * modified, then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);
		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.  The page stays locked SE_EXCL; it is
		 * unlocked later by the commit path (nfs4_sync_commit).
		 *
		 * Since this loop walks offsets in increasing order,
		 * extending c_commlen up to the current page always
		 * covers every page gathered so far.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}
12043 12051
/*
 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
 * Flushes and commits data to the server.
 */
static int
nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	verifier4 write_verf;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed. This may need to
	 * be done twice if the server has changed state since
	 * data was last written. The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen). This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs4_putpage
	 */

	/* Snapshot the write verifier so we can detect a server restart. */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_writeverf;
	mutex_exit(&rp->r_statelock);

	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
	/*
	 * EAGAIN from the async pass is not fatal here; the synchronous
	 * nfs4_putpage() below covers the same range and waits for it.
	 */
	if (error == EAGAIN)
		error = 0;

	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);

	if (error)
		return (error);

	/*
	 * If the write verifier changed while we were flushing, the
	 * server may have lost the data; start over and rewrite it.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_writeverf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If the error, NFS_VERF_MISMATCH, is returned, then
	 * start over with the flush operation.
	 */
	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
12117 12125
/*
 * nfs4_commit_vp() will wait for other pending commits and will either
 * commit the whole file or a range.  plen dictates whether we commit the
 * whole file; a value of zero indicates the whole file.  Called from
 * nfs4_putpage_commit() or nfs4_sync_putapage().
 */
static int
nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
    cred_t *cr, int wait_on_writes)
{
	rnode4_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * before we gather commitable pages make
	 * sure there are no outstanding async writes
	 */
	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Set the `commit inprogress' state bit. We must
	 * first wait until any current one finishes.  While R4COMMIT is
	 * held, this thread owns rp->r_commit and may use it without
	 * further locking.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather all of the pages which need to be
	 * committed.
	 */
	if (plen == 0)
		nfs4_get_commit(vp);
	else
		nfs4_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered by nfs4_get_commit.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist == NULL)
		return (0);

	/*
	 * No error occurred during the flush portion
	 * of this operation, so now attempt to commit
	 * the data to stable storage on the server.
	 *
	 * This will unlock all of the pages on the list.
	 */
	return (nfs4_sync_commit(vp, plist, offset, len, cr));
}
12201 12209
12202 12210 static int
12203 12211 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12204 12212 cred_t *cr)
12205 12213 {
12206 12214 int error;
12207 12215 page_t *pp;
12208 12216
12209 12217 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12210 12218
12211 12219 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12212 12220
12213 12221 /*
12214 12222 * If we got an error, then just unlock all of the pages
12215 12223 * on the list.
12216 12224 */
12217 12225 if (error) {
12218 12226 while (plist != NULL) {
12219 12227 pp = plist;
12220 12228 page_sub(&plist, pp);
12221 12229 page_unlock(pp);
12222 12230 }
12223 12231 return (error);
12224 12232 }
12225 12233 /*
12226 12234 * We've tried as hard as we can to commit the data to stable
12227 12235 * storage on the server. We just unlock the pages and clear
12228 12236 * the commit required state. They will get freed later.
12229 12237 */
12230 12238 while (plist != NULL) {
12231 12239 pp = plist;
12232 12240 page_sub(&plist, pp);
12233 12241 pp->p_fsdata = C_NOCOMMIT;
12234 12242 page_unlock(pp);
12235 12243 }
12236 12244
12237 12245 return (error);
12238 12246 }
12239 12247
/*
 * Async-commit entry point: simply performs a synchronous commit of the
 * page list.  The return value is deliberately ignored because
 * nfs4_sync_commit() unlocks every page on plist regardless of outcome,
 * and there is no caller to report the error to.
 */
static void
do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr)
{

	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
}
12247 12255
/*
 * VOP_SETSECATTR: set the ACL on a file.  The incoming vsecattr may hold
 * either aclent_t (VSA_ACL/VSA_DFACL) or ace_t (VSA_ACE) style entries;
 * both are translated to nfsace4 form and pushed via nfs4setattr().
 * Returns 0 on success, EIO for a cross-zone call, EINVAL for a bad
 * mask, ENOSYS when the server does not support ACLs (no MI4_ACL), or a
 * translation/setattr error.
 */
/*ARGSUSED*/
static int
nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error = 0;
	mntinfo4_t *mi;
	vattr_t va;
	vsecattr_t nfsace4_vsap;

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI4_ACL) {
		/*
		 * if we have a delegation, return it first so the server
		 * sees a consistent view before the ACL is changed
		 */
		if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
			(void) nfs4delegreturn(VTOR4(vp),
			    NFS4_DR_REOPEN|NFS4_DR_PUSH);

		error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
		    NFS4_ACL_SET);
		if (error) /* EINVAL */
			return (error);

		if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
			/*
			 * These are aclent_t type entries.
			 */
			error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
			    vp->v_type == VDIR, FALSE);
			if (error)
				return (error);
		} else {
			/*
			 * These are ace_t type entries.
			 */
			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
			    FALSE);
			if (error)
				return (error);
		}
		/* only the ACL portion of the setattr is populated */
		bzero(&va, sizeof (va));
		error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
		vs_ace4_destroy(&nfsace4_vsap);
		return (error);
	}
	return (ENOSYS);
}
12296 12304
/*
 * VOP_GETSECATTR: fetch the ACL for a file.  Serves the request from the
 * cached ACL when valid, otherwise goes over the wire with a full
 * getattr (which always returns the acl when the server supports it).
 * Falls back to fabricating an ACL from the mode bits (fs_fab_acl) for
 * referral stubs, non-ACL servers, and certain error cases.
 */
/* ARGSUSED */
int
nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	nfs4_ga_res_t gar;
	rnode4_t *rp = VTOR4(vp);

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	bzero(&gar, sizeof (gar));
	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;

	/*
	 * vsecattr->vsa_mask holds the original acl request mask.
	 * This is needed when determining what to return.
	 * (See: nfs4_create_getsecattr_return())
	 */
	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
	if (error) /* EINVAL */
		return (error);

	/*
	 * If this is a referral stub, don't try to go OTW for an ACL
	 */
	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));

	if (mi->mi_flags & MI4_ACL) {
		/*
		 * Check if the data is cached and the cache is valid.  If it
		 * is we don't go over the wire.  r_secattr is re-checked
		 * under r_statelock since the first test was unlocked.
		 */
		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_secattr != NULL) {
				error = nfs4_create_getsecattr_return(
				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
				    rp->r_attr.va_gid,
				    vp->v_type == VDIR);
				if (!error) { /* error == 0 - Success! */
					mutex_exit(&rp->r_statelock);
					return (error);
				}
			}
			mutex_exit(&rp->r_statelock);
		}

		/*
		 * The getattr otw call will always get both the acl, in
		 * the form of a list of nfsace4's, and the number of acl
		 * entries; independent of the value of gar.n4g_vsa.vsa_mask.
		 */
		gar.n4g_va.va_mask = AT_ALL;
		error = nfs4_getattr_otw(vp, &gar, cr, 1);
		if (error) {
			vs_ace4_destroy(&gar.n4g_vsa);
			if (error == ENOTSUP || error == EOPNOTSUPP)
				error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
			/*
			 * No error was returned, but according to the response
			 * bitmap, neither was an acl.
			 */
			vs_ace4_destroy(&gar.n4g_vsa);
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		/*
		 * Update the cache with the ACL.
		 */
		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);

		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
		    vp->v_type == VDIR);
		vs_ace4_destroy(&gar.n4g_vsa);
		/*
		 * If the ace4-to-aclent translation failed (other than
		 * EACCES) for an aclent-style request, fall back to a
		 * fabricated ACL rather than failing the whole call.
		 */
		if ((error) && (vsecattr->vsa_mask &
		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
		    (error != EACCES)) {
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
		}
		return (error);
	}
	error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
	return (error);
}
12392 12400
12393 12401 /*
12394 12402 * The function returns:
12395 12403 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12396 12404 * - EINVAL if the passed in "acl_mask" is an invalid request.
12397 12405 *
12398 12406 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12399 12407 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12400 12408 *
12401 12409 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12402 12410 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12403 12411 * - We have a count field set without the corresponding acl field set. (e.g. -
12404 12412 * VSA_ACECNT is set, but VSA_ACE is not)
12405 12413 */
12406 12414 static int
12407 12415 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12408 12416 {
12409 12417 /* Shortcut the masks that are always valid. */
12410 12418 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12411 12419 return (0);
12412 12420 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12413 12421 return (0);
12414 12422
12415 12423 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12416 12424 /*
12417 12425 * We can't have any VSA_ACL type stuff in the mask now.
12418 12426 */
12419 12427 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12420 12428 VSA_DFACLCNT))
12421 12429 return (EINVAL);
12422 12430
12423 12431 if (op == NFS4_ACL_SET) {
12424 12432 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12425 12433 return (EINVAL);
12426 12434 }
12427 12435 }
12428 12436
12429 12437 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12430 12438 /*
12431 12439 * We can't have any VSA_ACE type stuff in the mask now.
12432 12440 */
12433 12441 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12434 12442 return (EINVAL);
12435 12443
12436 12444 if (op == NFS4_ACL_SET) {
12437 12445 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12438 12446 return (EINVAL);
12439 12447
12440 12448 if ((acl_mask & VSA_DFACLCNT) &&
12441 12449 !(acl_mask & VSA_DFACL))
12442 12450 return (EINVAL);
12443 12451 }
12444 12452 }
12445 12453 return (0);
12446 12454 }
12447 12455
12448 12456 /*
12449 12457 * The theory behind creating the correct getsecattr return is simply this:
12450 12458 * "Don't return anything that the caller is not expecting to have to free."
12451 12459 */
12452 12460 static int
12453 12461 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12454 12462 uid_t uid, gid_t gid, int isdir)
12455 12463 {
12456 12464 int error = 0;
12457 12465 /* Save the mask since the translators modify it. */
12458 12466 uint_t orig_mask = vsap->vsa_mask;
12459 12467
12460 12468 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12461 12469 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12462 12470
12463 12471 if (error)
12464 12472 return (error);
12465 12473
12466 12474 /*
12467 12475 * If the caller only asked for the ace count (VSA_ACECNT)
12468 12476 * don't give them the full acl (VSA_ACE), free it.
12469 12477 */
12470 12478 if (!orig_mask & VSA_ACE) {
12471 12479 if (vsap->vsa_aclentp != NULL) {
12472 12480 kmem_free(vsap->vsa_aclentp,
12473 12481 vsap->vsa_aclcnt * sizeof (ace_t));
12474 12482 vsap->vsa_aclentp = NULL;
12475 12483 }
12476 12484 }
12477 12485 vsap->vsa_mask = orig_mask;
12478 12486
12479 12487 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12480 12488 VSA_DFACLCNT)) {
12481 12489 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12482 12490 isdir, FALSE);
12483 12491
12484 12492 if (error)
12485 12493 return (error);
12486 12494
12487 12495 /*
12488 12496 * If the caller only asked for the acl count (VSA_ACLCNT)
12489 12497 * and/or the default acl count (VSA_DFACLCNT) don't give them
12490 12498 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12491 12499 */
12492 12500 if (!orig_mask & VSA_ACL) {
12493 12501 if (vsap->vsa_aclentp != NULL) {
12494 12502 kmem_free(vsap->vsa_aclentp,
12495 12503 vsap->vsa_aclcnt * sizeof (aclent_t));
12496 12504 vsap->vsa_aclentp = NULL;
12497 12505 }
12498 12506 }
12499 12507
12500 12508 if (!orig_mask & VSA_DFACL) {
12501 12509 if (vsap->vsa_dfaclentp != NULL) {
12502 12510 kmem_free(vsap->vsa_dfaclentp,
12503 12511 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12504 12512 vsap->vsa_dfaclentp = NULL;
12505 12513 }
12506 12514 }
12507 12515 vsap->vsa_mask = orig_mask;
12508 12516 }
12509 12517 return (0);
12510 12518 }
12511 12519
/*
 * VOP_SHRLOCK: share reservation support.  Remote F_SHARE/F_UNSHARE are
 * not implemented (EAGAIN, see RFE 4823948 below); mounts with local
 * locking (MI4_LLOCK) are handed to the generic fs_shrlock() code.
 */
/* ARGSUSED */
int
nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 *
	 * NOTE(review): `cmd & F_SHARE' is a bitwise test, not an
	 * equality test, so this check can also fire for other cmd
	 * values sharing bits with F_SHARE — confirm this is intended.
	 */
	if ((cmd & F_SHARE) &&
	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		/*
		 * This will be properly implemented later,
		 * see RFE: 4823948 .
		 */
		error = EAGAIN;
		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
12567 12575
12568 12576 /*
12569 12577 * Common code called by directory ops to update the attrcache
12570 12578 */
12571 12579 static int
12572 12580 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12573 12581 hrtime_t t, vnode_t *vp, cred_t *cr)
12574 12582 {
12575 12583 int error = 0;
12576 12584
12577 12585 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12578 12586
12579 12587 if (status != NFS4_OK) {
12580 12588 /* getattr not done or failed */
12581 12589 PURGE_ATTRCACHE4(vp);
12582 12590 return (error);
12583 12591 }
12584 12592
12585 12593 if (garp) {
12586 12594 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12587 12595 } else {
12588 12596 PURGE_ATTRCACHE4(vp);
12589 12597 }
12590 12598 return (error);
12591 12599 }
12592 12600
12593 12601 /*
12594 12602 * Update directory caches for directory modification ops (link, rename, etc.)
12595 12603 * When dinfo is NULL, manage dircaches in the old way.
12596 12604 */
static void
nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
    dirattr_info_t *dinfo)
{
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/* Purge rddir cache for dir since it changed */
	if (drp->r_dir != NULL)
		nfs4_purge_rddir_cache(dvp);

	/*
	 * If caller provided dinfo, then use it to manage dir caches:
	 * update the DNLC for the new entry and refresh the directory's
	 * attribute cache from the post-op attributes.
	 */
	if (dinfo != NULL) {
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag is
				 * set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug,
				    (CE_NOTE, "nfs4_update_dircaches: "
				    "don't update dnlc: created_v4 flag"));
			}
		}

		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
		    dinfo->di_cred, FALSE, cinfo);

		return;
	}

	/*
	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
	 * Since caller modified dir but didn't receive post-dirmod-op dir
	 * attrs, the dir's attrs must be purged.
	 *
	 * XXX this check and dnlc update/purge should really be atomic,
	 * XXX but can't use rnode statelock because it'll deadlock in
	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
	 * XXX does occur.
	 *
	 * XXX We also may want to check that atomic is true in the
	 * XXX change_info struct. If it is not, the change_info may
	 * XXX reflect changes by more than one clients which means that
	 * XXX our cache may not be valid.
	 */
	PURGE_ATTRCACHE4(dvp);
	if (drp->r_change == cinfo->before) {
		/* no changes took place in the directory prior to our link */
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag
				 * is set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
				    "nfs4_update_dircaches: don't"
				    " update dnlc: created_v4 flag"));
			}
		}
	} else {
		/* Another client modified directory - purge its dnlc cache */
		dnlc_purge_vp(dvp);
	}
}
12675 12683
12676 12684 /*
12677 12685 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12678 12686 * file.
12679 12687 *
12680 12688 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12681 12689 * file (ie: client recovery) and otherwise set to FALSE.
12682 12690 *
12683 12691 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12684 12692 * initiated) calling functions.
12685 12693 *
12686 12694 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12687 12695 * of resending a 'lost' open request.
12688 12696 *
12689 12697 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12690 12698 * server that hands out BAD_SEQID on open confirm.
12691 12699 *
12692 12700 * Errors are returned via the nfs4_error_t parameter.
12693 12701 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	/* caller must hold the open-owner's seqid (see oo_seqid_inuse) */
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	/* Compound: PUTFH + OPEN_CONFIRM */
	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* advance the open seqid in the caller's copy as well */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/*
				 * num_bseqid_retryp bounds the number of
				 * BAD_SEQID retries (see header comment).
				 */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		/*
		 * For a timeout or server-resource failure (and no abort
		 * or resend), wait briefly and reissue the confirm.
		 */
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			if (!ep->error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Success: hand the confirmed open stateid back to the caller. */
	resop = &res.array[1];	/* open confirm res */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
12801 12809
12802 12810 /*
12803 12811 * Return the credentials associated with a client state object. The
12804 12812 * caller is responsible for freeing the credentials.
12805 12813 */
12806 12814
static cred_t *
state_to_cred(nfs4_open_stream_t *osp)
{
	cred_t *cr;

	/*
	 * It's ok to not lock the open stream and open owner to get
	 * the oo_cred since this is only written once (upon creation)
	 * and will not change.
	 */
	cr = osp->os_open_owner->oo_cred;
	/* take a hold for the caller, who must release it (see header) */
	crhold(cr);

	return (cr);
}
12822 12830
12823 12831 /*
12824 12832 * nfs4_find_sysid
12825 12833 *
12826 12834 * Find the sysid for the knetconfig associated with the given mi.
12827 12835 */
static struct lm_sysid *
nfs4_find_sysid(mntinfo4_t *mi)
{
	ASSERT(nfs_zone() == mi->mi_zone);

	/*
	 * Switch from RDMA knconf to original mount knconf: the lock
	 * manager identity is keyed on the original transport, the
	 * current server address and its hostname.
	 */
	return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
	    mi->mi_curr_serv->sv_hostname, NULL));
}
12839 12847
#ifdef DEBUG
/*
 * Return a string version of the call type for easy reading.
 */
static char *
nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
{
	char *name;

	switch (ctype) {
	case NFS4_LCK_CTYPE_NORM:
		name = "NORMAL";
		break;
	case NFS4_LCK_CTYPE_RECLAIM:
		name = "RECLAIM";
		break;
	case NFS4_LCK_CTYPE_RESEND:
		name = "RESEND";
		break;
	case NFS4_LCK_CTYPE_REINSTATE:
		name = "REINSTATE";
		break;
	default:
		cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
		    "type %d", ctype);
		name = "";
		break;
	}

	return (name);
}
#endif
12863 12871
12864 12872 /*
12865 12873 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12866 12874 * Unlock requests don't have an over-the-wire locktype, so we just return
12867 12875 * something non-threatening.
12868 12876 */
12869 12877
12870 12878 static nfs_lock_type4
12871 12879 flk_to_locktype(int cmd, int l_type)
12872 12880 {
12873 12881 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12874 12882
12875 12883 switch (l_type) {
12876 12884 case F_UNLCK:
12877 12885 return (READ_LT);
12878 12886 case F_RDLCK:
12879 12887 if (cmd == F_SETLK)
12880 12888 return (READ_LT);
12881 12889 else
12882 12890 return (READW_LT);
12883 12891 case F_WRLCK:
12884 12892 if (cmd == F_SETLK)
12885 12893 return (WRITE_LT);
12886 12894 else
12887 12895 return (WRITEW_LT);
12888 12896 }
12889 12897 panic("flk_to_locktype");
12890 12898 /*NOTREACHED*/
12891 12899 }
12892 12900
12893 12901 /*
12894 12902 * Do some preliminary checks for nfs4frlock.
12895 12903 */
12896 12904 static int
12897 12905 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12898 12906 u_offset_t offset)
12899 12907 {
12900 12908 int error = 0;
12901 12909
12902 12910 /*
12903 12911 * If we are setting a lock, check that the file is opened
12904 12912 * with the correct mode.
12905 12913 */
12906 12914 if (cmd == F_SETLK || cmd == F_SETLKW) {
12907 12915 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12908 12916 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12909 12917 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12910 12918 "nfs4frlock_validate_args: file was opened with "
12911 12919 "incorrect mode"));
12912 12920 return (EBADF);
12913 12921 }
12914 12922 }
12915 12923
12916 12924 /* Convert the offset. It may need to be restored before returning. */
12917 12925 if (error = convoff(vp, flk, 0, offset)) {
12918 12926 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12919 12927 "nfs4frlock_validate_args: convoff => error= %d\n",
12920 12928 error));
12921 12929 return (error);
12922 12930 }
12923 12931
12924 12932 return (error);
12925 12933 }
12926 12934
12927 12935 /*
12928 12936 * Set the flock64's lm_sysid for nfs4frlock.
12929 12937 */
12930 12938 static int
12931 12939 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12932 12940 {
12933 12941 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12934 12942
12935 12943 /* Find the lm_sysid */
12936 12944 *lspp = nfs4_find_sysid(VTOMI4(vp));
12937 12945
12938 12946 if (*lspp == NULL) {
12939 12947 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 12948 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12941 12949 return (ENOLCK);
12942 12950 }
12943 12951
12944 12952 flk->l_sysid = lm_sysidt(*lspp);
12945 12953
12946 12954 return (0);
12947 12955 }
12948 12956
12949 12957 /*
12950 12958 * Do the remaining preliminary setup for nfs4frlock.
12951 12959 */
12952 12960 static void
12953 12961 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12954 12962 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12955 12963 cred_t **cred_otw)
12956 12964 {
12957 12965 /*
12958 12966 * set tick_delay to the base delay time.
12959 12967 * (NFS4_BASE_WAIT_TIME is in secs)
12960 12968 */
12961 12969
12962 12970 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12963 12971
12964 12972 /*
12965 12973 * If lock is relative to EOF, we need the newest length of the
12966 12974 * file. Therefore invalidate the ATTR_CACHE.
12967 12975 */
12968 12976
12969 12977 *whencep = flk->l_whence;
12970 12978
12971 12979 if (*whencep == 2) /* SEEK_END */
12972 12980 PURGE_ATTRCACHE4(vp);
12973 12981
12974 12982 recov_statep->rs_flags = 0;
12975 12983 recov_statep->rs_num_retry_despite_err = 0;
12976 12984 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12977 12985 }
12978 12986
12979 12987 /*
12980 12988 * Initialize and allocate the data structures necessary for
12981 12989 * the nfs4frlock call.
12982 12990 * Allocates argsp's op array, frees up the saved_rqstpp if there is one.
12983 12991 */
12984 12992 static void
12985 12993 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
12986 12994 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
12987 12995 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
12988 12996 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
12989 12997 {
12990 12998 int argoplist_size;
12991 12999 int num_ops = 2;
12992 13000
12993 13001 *retry = FALSE;
12994 13002 *did_start_fop = FALSE;
12995 13003 *skip_get_err = FALSE;
12996 13004 lost_rqstp->lr_op = 0;
12997 13005 argoplist_size = num_ops * sizeof (nfs_argop4);
12998 13006 /* fill array with zero */
12999 13007 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13000 13008
13001 13009 *argspp = argsp;
13002 13010 *respp = NULL;
13003 13011
13004 13012 argsp->array_len = num_ops;
13005 13013 argsp->array = *argopp;
13006 13014
13007 13015 /* initialize in case of error; will get real value down below */
13008 13016 argsp->ctag = TAG_NONE;
13009 13017
13010 13018 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13011 13019 *op_hintp = OH_LOCKU;
13012 13020 else
13013 13021 *op_hintp = OH_OTHER;
13014 13022 }
13015 13023
13016 13024 /*
13017 13025 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign
13018 13026 * the proper nfs4_server_t for this instance of nfs4frlock.
13019 13027 * Returns 0 (success) or an errno value.
13020 13028 */
13021 13029 static int
13022 13030 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13023 13031 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13024 13032 bool_t *did_start_fop, bool_t *startrecovp)
13025 13033 {
13026 13034 int error = 0;
13027 13035 rnode4_t *rp;
13028 13036
13029 13037 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13030 13038
13031 13039 if (ctype == NFS4_LCK_CTYPE_NORM) {
13032 13040 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13033 13041 recov_statep, startrecovp);
13034 13042 if (error)
13035 13043 return (error);
13036 13044 *did_start_fop = TRUE;
13037 13045 } else {
13038 13046 *did_start_fop = FALSE;
13039 13047 *startrecovp = FALSE;
13040 13048 }
13041 13049
13042 13050 if (!error) {
13043 13051 rp = VTOR4(vp);
13044 13052
13045 13053 /* If the file failed recovery, just quit. */
13046 13054 mutex_enter(&rp->r_statelock);
13047 13055 if (rp->r_flags & R4RECOVERR) {
13048 13056 error = EIO;
13049 13057 }
13050 13058 mutex_exit(&rp->r_statelock);
13051 13059 }
13052 13060
13053 13061 return (error);
13054 13062 }
13055 13063
/*
 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
 * resend nfs4frlock call is initiated by the recovery framework.
 * Acquires the lop and oop seqid synchronization (holds are placed on
 * the open owner, lock owner, and open stream; the caller is
 * responsible for releasing them and ending the seqid syncs).
 */
static void
nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
    COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
{
	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
	int error;

	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
	ASSERT(resend_rqstp != NULL);
	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
	    resend_rqstp->lr_op == OP_LOCKU);

	*oopp = resend_rqstp->lr_oop;
	if (resend_rqstp->lr_oop) {
		open_owner_hold(resend_rqstp->lr_oop);
		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
		ASSERT(error == 0);	/* recov thread always succeeds */
	}

	/* Must resend this lost lock/locku request. */
	ASSERT(resend_rqstp->lr_lop != NULL);
	*lopp = resend_rqstp->lr_lop;
	lock_owner_hold(resend_rqstp->lr_lop);
	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
	ASSERT(error == 0);	/* recov thread always succeeds */

	*ospp = resend_rqstp->lr_osp;
	if (*ospp)
		open_stream_hold(resend_rqstp->lr_osp);

	if (resend_rqstp->lr_op == OP_LOCK) {
		LOCK4args *lock_args;

		argop->argop = OP_LOCK;
		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
		lock_args->locktype = resend_rqstp->lr_locktype;
		lock_args->reclaim =
		    (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
		lock_args->offset = resend_rqstp->lr_flk->l_start;
		lock_args->length = resend_rqstp->lr_flk->l_len;
		/*
		 * An l_len of 0 means lock-to-EOF; over the wire that is
		 * expressed as an all-ones (maximum) length.
		 */
		if (lock_args->length == 0)
			lock_args->length = ~lock_args->length;
		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
		    mi2clientid(mi), &lock_args->locker);

		/* Tag the compound by the kind of resend being performed. */
		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCK_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCK_REINSTATE;
			break;
		case NFS4_LCK_CTYPE_RECLAIM:
			argsp->ctag = TAG_LOCK_RECLAIM;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	} else {
		LOCKU4args *locku_args;
		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;

		argop->argop = OP_LOCKU;
		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
		/* LOCKU has no meaningful locktype; READ_LT is a valid one */
		locku_args->locktype = READ_LT;
		locku_args->seqid = lop->lock_seqid + 1;
		/* lock_stateid is protected by lo_lock */
		mutex_enter(&lop->lo_lock);
		locku_args->lock_stateid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		locku_args->offset = resend_rqstp->lr_flk->l_start;
		locku_args->length = resend_rqstp->lr_flk->l_len;
		/* l_len of 0 means to-EOF: all-ones length over the wire */
		if (locku_args->length == 0)
			locku_args->length = ~locku_args->length;

		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCKU_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCKU_REINSTATE;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	}
}
13153 13161
13154 13162 /*
13155 13163 * Setup the LOCKT4 arguments.
13156 13164 */
13157 13165 static void
13158 13166 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13159 13167 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13160 13168 rnode4_t *rp)
13161 13169 {
13162 13170 LOCKT4args *lockt_args;
13163 13171
13164 13172 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13165 13173 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13166 13174 argop->argop = OP_LOCKT;
13167 13175 argsp->ctag = TAG_LOCKT;
13168 13176 lockt_args = &argop->nfs_argop4_u.oplockt;
13169 13177
13170 13178 /*
13171 13179 * The locktype will be READ_LT unless it's
13172 13180 * a write lock. We do this because the Solaris
13173 13181 * system call allows the combination of
13174 13182 * F_UNLCK and F_GETLK* and so in that case the
13175 13183 * unlock is mapped to a read.
13176 13184 */
13177 13185 if (flk->l_type == F_WRLCK)
13178 13186 lockt_args->locktype = WRITE_LT;
13179 13187 else
13180 13188 lockt_args->locktype = READ_LT;
13181 13189
13182 13190 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13183 13191 /* set the lock owner4 args */
13184 13192 nfs4_setlockowner_args(&lockt_args->owner, rp,
13185 13193 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13186 13194 flk->l_pid);
13187 13195 lockt_args->offset = flk->l_start;
13188 13196 lockt_args->length = flk->l_len;
13189 13197 if (flk->l_len == 0)
13190 13198 lockt_args->length = ~lockt_args->length;
13191 13199
13192 13200 *lockt_argsp = lockt_args;
13193 13201 }
13194 13202
/*
 * If the client is holding a delegation, and the open stream to be used
 * with this lock request is a delegation open stream, then re-open the stream.
 * Sets the nfs4_error_t to all zeros unless the open stream has already
 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
 * means the caller should retry (like a recovery retry).
 *
 * 'lt' is the flock lock type (F_RDLCK/F_WRLCK/F_UNLCK) being requested;
 * it determines which share-access mode a prior CLAIM_DELEGATE_CUR open
 * must have granted for no reopen to be needed.
 */
static void
nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
{
	open_delegation_type4 dt;
	bool_t reopen_needed, force;
	nfs4_open_stream_t *osp;
	open_claim_type4 oclaim;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* r_deleg_type is protected by r_statev4_lock */
	mutex_enter(&rp->r_statev4_lock);
	dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	if (dt != OPEN_DELEGATE_NONE) {
		nfs4_open_owner_t *oop;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (!oop) {
			ep->stat = NFS4ERR_IO;
			return;
		}
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/* A previously failed reopen makes this stream unusable. */
		if (osp->os_failed_reopen) {
			NFS4_DEBUG((nfs4_open_stream_debug ||
			    nfs4_client_lock_debug), (CE_NOTE,
			    "nfs4frlock_check_deleg: os_failed_reopen set "
			    "for osp %p, cr %p, rp %s", (void *)osp,
			    (void *)cr, rnode4info(rp)));
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/*
		 * Determine whether a reopen is needed. If this
		 * is a delegation open stream, then send the open
		 * to the server to give visibility to the open owner.
		 * Even if it isn't a delegation open stream, we need
		 * to check if the previous open CLAIM_DELEGATE_CUR
		 * was sufficient.
		 */

		reopen_needed = osp->os_delegation ||
		    ((lt == F_RDLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
		    (lt == F_WRLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));

		mutex_exit(&osp->os_sync_lock);
		open_owner_rele(oop);

		if (reopen_needed) {
			/*
			 * Always use CLAIM_PREVIOUS after server reboot.
			 * The server will reject CLAIM_DELEGATE_CUR if
			 * it is used during the grace period.
			 */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
				oclaim = CLAIM_PREVIOUS;
				force = TRUE;
			} else {
				oclaim = CLAIM_DELEGATE_CUR;
				force = FALSE;
			}
			mutex_exit(&mi->mi_lock);

			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
			/* Map EAGAIN from the reopen to a retryable DELAY. */
			if (ep->error == EAGAIN) {
				nfs4_error_zinit(ep);
				ep->stat = NFS4ERR_DELAY;
			}
		}
		/* drop the hold acquired by find_open_stream() */
		open_stream_rele(osp, rp);
		osp = NULL;
	}
}
13293 13301
/*
 * Setup the LOCKU4 arguments.
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
 *			over-the-wire.  The caller must release the
 *			reference on *lopp.
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error.
 *
 * On the no-lock-owner (or special-stateid) path the unlock is only
 * registered with the local locking code, *skip_get_err is set, and no
 * over-the-wire call is made.  Otherwise the lock seqid sync is started
 * and held across the RPC (released by the caller).
 */
static void
nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKU4args **locku_argsp, flock64_t *flk,
    nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
    vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
    bool_t *skip_get_err, bool_t *go_otwp)
{
	nfs4_lock_owner_t *lop = NULL;
	LOCKU4args *locku_args;
	pid_t pid;
	bool_t is_spec = FALSE;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

	/* May require a reopen of a delegation open stream first. */
	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
	if (ep->error || ep->stat)
		return;

	argop->argop = OP_LOCKU;
	/* NOTE(review): dead branch given the ASSERT above; kept as-is */
	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
		argsp->ctag = TAG_LOCKU_REINSTATE;
	else
		argsp->ctag = TAG_LOCKU;
	locku_args = &argop->nfs_argop4_u.oplocku;
	*locku_argsp = locku_args;

	/*
	 * XXX what should locku_args->locktype be?
	 * setting to ALWAYS be READ_LT so at least
	 * it is a valid locktype.
	 */

	locku_args->locktype = READ_LT;

	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid;

	/*
	 * Get the lock owner stateid.  If no lock owner
	 * exists, return success.
	 */
	lop = find_lock_owner(rp, pid, LOWN_ANY);
	*lopp = lop;
	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
		is_spec = TRUE;
	if (!lop || is_spec) {
		/*
		 * No lock owner so no locks to unlock.
		 * Return success.  If there was a failed
		 * reclaim earlier, the lock might still be
		 * registered with the local locking code,
		 * so notify it of the unlock.
		 *
		 * If the lockowner is using a special stateid,
		 * then the original lock request (that created
		 * this lockowner) was never successful, so we
		 * have no lock to undo OTW.
		 */
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
		    "(%ld) so return success", (long)pid));

		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		/*
		 * Release our hold and NULL out so final_cleanup
		 * doesn't try to end a lock seqid sync we
		 * never started.
		 */
		if (is_spec) {
			lock_owner_rele(lop);
			*lopp = NULL;
		}
		*skip_get_err = TRUE;
		*go_otwp = FALSE;
		return;
	}

	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
	if (ep->error == EAGAIN) {
		lock_owner_rele(lop);
		*lopp = NULL;
		return;
	}

	/* lock_stateid is protected by lo_lock */
	mutex_enter(&lop->lo_lock);
	locku_args->lock_stateid = lop->lock_stateid;
	mutex_exit(&lop->lo_lock);
	locku_args->seqid = lop->lock_seqid + 1;

	/* leave the ref count on lop, rele after RPC call */

	locku_args->offset = flk->l_start;
	locku_args->length = flk->l_len;
	/* l_len of 0 means to-EOF: all-ones length over the wire */
	if (flk->l_len == 0)
		locku_args->length = ~locku_args->length;

	*go_otwp = TRUE;
}
13405 13413
/*
 * Setup the LOCK4 arguments.
 *
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error
 *
 * On success *oopp/*ospp/*lopp are set (with holds and seqid syncs
 * acquired by nfs4_find_or_create_lock_owner); the caller is
 * responsible for releasing them.
 */
static void
nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
    flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
{
	LOCK4args *lock_args;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	pid_t pid;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* May require a reopen of a delegation open stream first. */
	nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
	if (ep->error || ep->stat != NFS4_OK)
		return;

	argop->argop = OP_LOCK;
	/* Tag the compound by call type for observability. */
	if (ctype == NFS4_LCK_CTYPE_NORM)
		argsp->ctag = TAG_LOCK;
	else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
		argsp->ctag = TAG_RELOCK;
	else
		argsp->ctag = TAG_LOCK_REINSTATE;
	lock_args = &argop->nfs_argop4_u.oplock;
	lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
	lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
	/*
	 * Get the lock owner. If no lock owner exists,
	 * create a 'temporary' one and grab the open seqid
	 * synchronization (which puts a hold on the open
	 * owner and open stream).
	 * This also grabs the lock seqid synchronization.
	 */
	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
	ep->stat =
	    nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);

	if (ep->stat != NFS4_OK)
		goto out;

	nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
	    &lock_args->locker);

	lock_args->offset = flk->l_start;
	lock_args->length = flk->l_len;
	/* l_len of 0 means to-EOF: all-ones length over the wire */
	if (flk->l_len == 0)
		lock_args->length = ~lock_args->length;
	*lock_argsp = lock_args;
out:
	/* hand back whatever was obtained, even on error, for cleanup */
	*oopp = oop;
	*ospp = osp;
	*lopp = lop;
}
13470 13478
/*
 * After we get the reply from the server, record the proper information
 * for possible resend lock requests.
 *
 * Allocates memory for the saved_rqstp if we have a lost lock to save.
 *
 * 'error' is the RPC error from the attempted LOCK/LOCKU; only
 * ETIMEDOUT, EINTR, and forced-unmount EIO mark the request as "lost"
 * (and thus eligible for resend by the recovery framework).
 */
static void
nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
    nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
    nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
{
	bool_t unlock = (flk->l_type == F_UNLCK);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE);

	/* A failed lock (not unlock) leaves a pending request on the lop. */
	if (error != 0 && !unlock) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
		    " for lop %p", (void *)lop));
		ASSERT(lop != NULL);
		mutex_enter(&lop->lo_lock);
		lop->lo_pending_rqsts = 1;
		mutex_exit(&lop->lo_lock);
	}

	/* lr_op == 0 means "nothing lost" to the caller/recovery code. */
	lost_rqstp->lr_putfirst = FALSE;
	lost_rqstp->lr_op = 0;

	/*
	 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
	 * recovery purposes so that the lock request that was sent
	 * can be saved and re-issued later. Ditto for EIO from a forced
	 * unmount. This is done to have the client's local locking state
	 * match the v4 server's state; that is, the request was
	 * potentially received and accepted by the server but the client
	 * thinks it was not.
	 */
	if (error == ETIMEDOUT || error == EINTR ||
	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: got a lost %s lock for "
		    "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
		    (void *)lop, (void *)oop, (void *)osp));
		if (unlock)
			lost_rqstp->lr_op = OP_LOCKU;
		else {
			lost_rqstp->lr_op = OP_LOCK;
			lost_rqstp->lr_locktype = locktype;
		}
		/*
		 * Objects are held and rele'd via the recovery code.
		 * See nfs4_save_lost_rqst.
		 */
		lost_rqstp->lr_vp = vp;
		lost_rqstp->lr_dvp = NULL;
		lost_rqstp->lr_oop = oop;
		lost_rqstp->lr_osp = osp;
		lost_rqstp->lr_lop = lop;
		lost_rqstp->lr_cr = cr;
		switch (ctype) {
		case NFS4_LCK_CTYPE_NORM:
			flk->l_pid = ttoproc(curthread)->p_pid;
			lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			/* reinstate requests go to the front of the queue */
			lost_rqstp->lr_putfirst = TRUE;
			lost_rqstp->lr_ctype = ctype;
			break;
		default:
			break;
		}
		lost_rqstp->lr_flk = flk;
	}
}
13550 13558
13551 13559 /*
13552 13560 * Update lop's seqid. Also update the seqid stored in a resend request,
13553 13561 * if any. (Some recovery errors increment the seqid, and we may have to
13554 13562 * send the resend request again.)
13555 13563 */
13556 13564
13557 13565 static void
13558 13566 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13559 13567 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13560 13568 {
13561 13569 if (lock_args) {
13562 13570 if (lock_args->locker.new_lock_owner == TRUE)
13563 13571 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13564 13572 else {
13565 13573 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13566 13574 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13567 13575 }
13568 13576 } else if (locku_args) {
13569 13577 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13570 13578 nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13571 13579 }
13572 13580 }
13573 13581
/*
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 * Switches the *cred_otwp to base_cr (taking a new hold on it).
 *
 * NOTE(review): judging by the name and the credential swap, this
 * appears to be the teardown path used when retrying the lock request
 * with a different credential — confirm against the nfs4frlock caller.
 */
static void
nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
    nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
    nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;
	nfs_argop4 *argop = (*argspp)->array;

	if (*did_start_fop) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}
	/* the compound is always PUTFH + one lock op (see call_init) */
	ASSERT((*argspp)->array_len == 2);
	if (argop[1].argop == OP_LOCK)
		nfs4args_lock_free(&argop[1]);
	else if (argop[1].argop == OP_LOCKT)
		nfs4args_lockt_free(&argop[1]);
	kmem_free(argop, 2 * sizeof (nfs_argop4));
	/* results were only decoded if the RPC itself succeeded */
	if (!error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
	*argspp = NULL;
	*respp = NULL;

	if (lop) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
		*lopp = NULL;
	}

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, VTOR4(vp));
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	/* retry will go over the wire with base_cr */
	crfree(*cred_otwp);
	*cred_otwp = base_cr;
	crhold(*cred_otwp);
}
13630 13638
/*
 * Function to process the client's recovery for nfs4frlock.
 * Returns TRUE if we should retry the lock request; FALSE otherwise.
 *
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 *
 * Note: the rp's r_lkserlock is *not* dropped during this path.
 */
static bool_t
nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
    LOCK4args *lock_args, LOCKU4args *locku_args,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
    nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
    bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;

	bool_t abort, retry;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT((*argspp) != NULL);
	ASSERT((*respp) != NULL);
	if (lock_args || locku_args)
		ASSERT(lop != NULL);

	NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
	    (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));

	retry = TRUE;
	abort = FALSE;
	if (needrecov) {
		nfs4_bseqid_entry_t *bsep = NULL;
		nfs_opnum4 op;

		/* which lock op was being attempted */
		op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;

		/*
		 * For a BAD_SEQID error, hand the recovery framework the
		 * seqid that the server rejected so it can resynchronize.
		 */
		if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
			seqid4 seqid;

			if (lock_args) {
				if (lock_args->locker.new_lock_owner == TRUE)
					seqid = lock_args->locker.locker4_u.
					    open_owner.open_seqid;
				else
					seqid = lock_args->locker.locker4_u.
					    lock_owner.lock_seqid;
			} else if (locku_args) {
				seqid = locku_args->seqid;
			} else {
				/* LOCKT carries no seqid */
				seqid = 0;
			}

			bsep = nfs4_create_bseqid_entry(oop, lop, vp,
			    flk->l_pid, (*argspp)->ctag, seqid);
		}

		/* only pass a lost request if it is a LOCK/LOCKU resend */
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
		    lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
		    NULL, op, bsep, NULL, NULL);

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
	}

	/*
	 * Return that we do not want to retry the request for 3 cases:
	 * 1. If we received EINTR or are bailing out because of a forced
	 *    unmount, we came into this code path just for the sake of
	 *    initiating recovery, we now need to return the error.
	 * 2. If we have aborted recovery.
	 * 3. We received NFS4ERR_BAD_SEQID.
	 */
	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
	    abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
		retry = FALSE;

	if (*did_start_fop == TRUE) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}

	/* only free the args/res if the caller is going to loop and retry */
	if (retry == TRUE) {
		nfs_argop4 *argop;

		argop = (*argspp)->array;
		ASSERT((*argspp)->array_len == 2);

		if (argop[1].argop == OP_LOCK)
			nfs4args_lock_free(&argop[1]);
		else if (argop[1].argop == OP_LOCKT)
			nfs4args_lockt_free(&argop[1]);
		kmem_free(argop, 2 * sizeof (nfs_argop4));
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
		*respp = NULL;
		*argspp = NULL;
	}

	if (lop != NULL) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
	}

	*lopp = NULL;

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, rp);
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	return (retry);
}
13758 13766
13759 13767 /*
13760 13768 * Handles the successful reply from the server for nfs4frlock.
13761 13769 */
13762 13770 static void
13763 13771 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13764 13772 vnode_t *vp, int flag, u_offset_t offset,
13765 13773 nfs4_lost_rqst_t *resend_rqstp)
13766 13774 {
13767 13775 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13768 13776 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13769 13777 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13770 13778 if (ctype == NFS4_LCK_CTYPE_NORM) {
13771 13779 flk->l_pid = ttoproc(curthread)->p_pid;
13772 13780 /*
13773 13781 * We do not register lost locks locally in
13774 13782 * the 'resend' case since the user/application
13775 13783 * doesn't think we have the lock.
13776 13784 */
13777 13785 ASSERT(!resend_rqstp);
13778 13786 nfs4_register_lock_locally(vp, flk, flag, offset);
13779 13787 }
13780 13788 }
13781 13789 }
13782 13790
13783 13791 /*
13784 13792 * Handle the DENIED reply from the server for nfs4frlock.
13785 13793 * Returns TRUE if we should retry the request; FALSE otherwise.
13786 13794 *
13787 13795 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13788 13796 * COMPOUND4 args/res for calls that need to retry. Can also
13789 13797 * drop and regrab the r_lkserlock.
13790 13798 */
13791 13799 static bool_t
13792 13800 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13793 13801 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13794 13802 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13795 13803 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13796 13804 nfs4_recov_state_t *recov_statep, int needrecov,
13797 13805 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13798 13806 clock_t *tick_delayp, short *whencep, int *errorp,
13799 13807 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13800 13808 bool_t *skip_get_err)
13801 13809 {
13802 13810 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13803 13811
13804 13812 if (lock_args) {
13805 13813 nfs4_open_owner_t *oop = *oopp;
13806 13814 nfs4_open_stream_t *osp = *ospp;
13807 13815 nfs4_lock_owner_t *lop = *lopp;
13808 13816 int intr;
13809 13817
13810 13818 /*
13811 13819 * Blocking lock needs to sleep and retry from the request.
13812 13820 *
13813 13821 * Do not block and wait for 'resend' or 'reinstate'
13814 13822 * lock requests, just return the error.
13815 13823 *
13816 13824 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13817 13825 */
13818 13826 if (cmd == F_SETLKW) {
13819 13827 rnode4_t *rp = VTOR4(vp);
13820 13828 nfs_argop4 *argop = (*argspp)->array;
13821 13829
13822 13830 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13823 13831
13824 13832 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13825 13833 recov_statep, needrecov);
13826 13834 *did_start_fop = FALSE;
13827 13835 ASSERT((*argspp)->array_len == 2);
13828 13836 if (argop[1].argop == OP_LOCK)
13829 13837 nfs4args_lock_free(&argop[1]);
13830 13838 else if (argop[1].argop == OP_LOCKT)
13831 13839 nfs4args_lockt_free(&argop[1]);
13832 13840 kmem_free(argop, 2 * sizeof (nfs_argop4));
13833 13841 if (*respp)
13834 13842 (void) xdr_free(xdr_COMPOUND4res_clnt,
13835 13843 (caddr_t)*respp);
13836 13844 *argspp = NULL;
13837 13845 *respp = NULL;
13838 13846 nfs4_end_lock_seqid_sync(lop);
13839 13847 lock_owner_rele(lop);
13840 13848 *lopp = NULL;
13841 13849 if (osp != NULL) {
13842 13850 open_stream_rele(osp, rp);
13843 13851 *ospp = NULL;
13844 13852 }
13845 13853 if (oop != NULL) {
13846 13854 nfs4_end_open_seqid_sync(oop);
13847 13855 open_owner_rele(oop);
13848 13856 *oopp = NULL;
13849 13857 }
13850 13858
13851 13859 nfs_rw_exit(&rp->r_lkserlock);
13852 13860
13853 13861 intr = nfs4_block_and_wait(tick_delayp, rp);
13854 13862
13855 13863 if (intr) {
13856 13864 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13857 13865 RW_WRITER, FALSE);
13858 13866 *errorp = EINTR;
13859 13867 return (FALSE);
13860 13868 }
13861 13869
13862 13870 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13863 13871 RW_WRITER, FALSE);
13864 13872
13865 13873 /*
13866 13874 * Make sure we are still safe to lock with
13867 13875 * regards to mmapping.
13868 13876 */
13869 13877 if (!nfs4_safelock(vp, flk, cr)) {
13870 13878 *errorp = EAGAIN;
13871 13879 return (FALSE);
13872 13880 }
13873 13881
13874 13882 return (TRUE);
13875 13883 }
13876 13884 if (ctype == NFS4_LCK_CTYPE_NORM)
13877 13885 *errorp = EAGAIN;
13878 13886 *skip_get_err = TRUE;
13879 13887 flk->l_whence = 0;
13880 13888 *whencep = 0;
13881 13889 return (FALSE);
13882 13890 } else if (lockt_args) {
13883 13891 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13884 13892 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13885 13893
13886 13894 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13887 13895 flk, lockt_args);
13888 13896
13889 13897 /* according to NLM code */
13890 13898 *errorp = 0;
13891 13899 *whencep = 0;
13892 13900 *skip_get_err = TRUE;
13893 13901 return (FALSE);
13894 13902 }
13895 13903 return (FALSE);
13896 13904 }
13897 13905
13898 13906 /*
13899 13907 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13900 13908 */
13901 13909 static void
13902 13910 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13903 13911 {
13904 13912 switch (resp->status) {
13905 13913 case NFS4ERR_ACCESS:
13906 13914 case NFS4ERR_ADMIN_REVOKED:
13907 13915 case NFS4ERR_BADHANDLE:
13908 13916 case NFS4ERR_BAD_RANGE:
13909 13917 case NFS4ERR_BAD_SEQID:
13910 13918 case NFS4ERR_BAD_STATEID:
13911 13919 case NFS4ERR_BADXDR:
13912 13920 case NFS4ERR_DEADLOCK:
13913 13921 case NFS4ERR_DELAY:
13914 13922 case NFS4ERR_EXPIRED:
13915 13923 case NFS4ERR_FHEXPIRED:
13916 13924 case NFS4ERR_GRACE:
13917 13925 case NFS4ERR_INVAL:
13918 13926 case NFS4ERR_ISDIR:
13919 13927 case NFS4ERR_LEASE_MOVED:
13920 13928 case NFS4ERR_LOCK_NOTSUPP:
13921 13929 case NFS4ERR_LOCK_RANGE:
13922 13930 case NFS4ERR_MOVED:
13923 13931 case NFS4ERR_NOFILEHANDLE:
13924 13932 case NFS4ERR_NO_GRACE:
13925 13933 case NFS4ERR_OLD_STATEID:
13926 13934 case NFS4ERR_OPENMODE:
13927 13935 case NFS4ERR_RECLAIM_BAD:
13928 13936 case NFS4ERR_RECLAIM_CONFLICT:
13929 13937 case NFS4ERR_RESOURCE:
13930 13938 case NFS4ERR_SERVERFAULT:
13931 13939 case NFS4ERR_STALE:
13932 13940 case NFS4ERR_STALE_CLIENTID:
13933 13941 case NFS4ERR_STALE_STATEID:
13934 13942 return;
13935 13943 default:
13936 13944 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13937 13945 "nfs4frlock_results_default: got unrecognizable "
13938 13946 "res.status %d", resp->status));
13939 13947 *errorp = NFS4ERR_INVAL;
13940 13948 }
13941 13949 }
13942 13950
13943 13951 /*
13944 13952 * The lock request was successful, so update the client's state.
13945 13953 */
13946 13954 static void
13947 13955 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13948 13956 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13949 13957 vnode_t *vp, flock64_t *flk, cred_t *cr,
13950 13958 nfs4_lost_rqst_t *resend_rqstp)
13951 13959 {
13952 13960 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13953 13961
13954 13962 if (lock_args) {
13955 13963 LOCK4res *lock_res;
13956 13964
13957 13965 lock_res = &resop->nfs_resop4_u.oplock;
13958 13966 /* update the stateid with server's response */
13959 13967
13960 13968 if (lock_args->locker.new_lock_owner == TRUE) {
13961 13969 mutex_enter(&lop->lo_lock);
13962 13970 lop->lo_just_created = NFS4_PERM_CREATED;
13963 13971 mutex_exit(&lop->lo_lock);
13964 13972 }
13965 13973
13966 13974 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13967 13975
13968 13976 /*
13969 13977 * If the lock was the result of a resending a lost
13970 13978 * request, we've synched up the stateid and seqid
13971 13979 * with the server, but now the server might be out of sync
13972 13980 * with what the application thinks it has for locks.
13973 13981 * Clean that up here. It's unclear whether we should do
13974 13982 * this even if the filesystem has been forcibly unmounted.
13975 13983 * For most servers, it's probably wasted effort, but
13976 13984 * RFC3530 lets servers require that unlocks exactly match
13977 13985 * the locks that are held.
13978 13986 */
13979 13987 if (resend_rqstp != NULL &&
13980 13988 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13981 13989 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13982 13990 } else {
13983 13991 flk->l_whence = 0;
13984 13992 }
13985 13993 } else if (locku_args) {
13986 13994 LOCKU4res *locku_res;
13987 13995
13988 13996 locku_res = &resop->nfs_resop4_u.oplocku;
13989 13997
13990 13998 /* Update the stateid with the server's response */
13991 13999 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
13992 14000 } else if (lockt_args) {
13993 14001 /* Switch the lock type to express success, see fcntl */
13994 14002 flk->l_type = F_UNLCK;
13995 14003 flk->l_whence = 0;
13996 14004 }
13997 14005 }
13998 14006
13999 14007 /*
14000 14008 * Do final cleanup before exiting nfs4frlock.
14001 14009 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14002 14010 * COMPOUND4 args/res for calls that haven't already.
14003 14011 */
14004 14012 static void
14005 14013 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14006 14014 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14007 14015 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14008 14016 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14009 14017 short whence, u_offset_t offset, struct lm_sysid *ls,
14010 14018 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14011 14019 bool_t did_start_fop, bool_t skip_get_err,
14012 14020 cred_t *cred_otw, cred_t *cred)
14013 14021 {
14014 14022 mntinfo4_t *mi = VTOMI4(vp);
14015 14023 rnode4_t *rp = VTOR4(vp);
14016 14024 int error = *errorp;
14017 14025 nfs_argop4 *argop;
14018 14026 int do_flush_pages = 0;
14019 14027
14020 14028 ASSERT(nfs_zone() == mi->mi_zone);
14021 14029 /*
14022 14030 * The client recovery code wants the raw status information,
14023 14031 * so don't map the NFS status code to an errno value for
14024 14032 * non-normal call types.
14025 14033 */
14026 14034 if (ctype == NFS4_LCK_CTYPE_NORM) {
14027 14035 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14028 14036 *errorp = geterrno4(resp->status);
14029 14037 if (did_start_fop == TRUE)
14030 14038 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14031 14039 needrecov);
14032 14040
14033 14041 /*
14034 14042 * We've established a new lock on the server, so invalidate
14035 14043 * the pages associated with the vnode to get the most up to
14036 14044 * date pages from the server after acquiring the lock. We
14037 14045 * want to be sure that the read operation gets the newest data.
14038 14046 * N.B.
14039 14047 * We used to do this in nfs4frlock_results_ok but that doesn't
14040 14048 * work since VOP_PUTPAGE can call nfs4_commit which calls
14041 14049 * nfs4_start_fop. We flush the pages below after calling
14042 14050 * nfs4_end_fop above
14043 14051 * The flush of the page cache must be done after
14044 14052 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14045 14053 */
14046 14054 if (!error && resp && resp->status == NFS4_OK)
14047 14055 do_flush_pages = 1;
14048 14056 }
14049 14057 if (argsp) {
14050 14058 ASSERT(argsp->array_len == 2);
14051 14059 argop = argsp->array;
14052 14060 if (argop[1].argop == OP_LOCK)
14053 14061 nfs4args_lock_free(&argop[1]);
14054 14062 else if (argop[1].argop == OP_LOCKT)
14055 14063 nfs4args_lockt_free(&argop[1]);
14056 14064 kmem_free(argop, 2 * sizeof (nfs_argop4));
14057 14065 if (resp)
14058 14066 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14059 14067 }
14060 14068
14061 14069 /* free the reference on the lock owner */
14062 14070 if (lop != NULL) {
14063 14071 nfs4_end_lock_seqid_sync(lop);
14064 14072 lock_owner_rele(lop);
14065 14073 }
14066 14074
14067 14075 /* need to free up the reference on osp for lock args */
14068 14076 if (osp != NULL)
14069 14077 open_stream_rele(osp, rp);
14070 14078
14071 14079 /* need to free up the reference on oop for lock args */
14072 14080 if (oop != NULL) {
14073 14081 nfs4_end_open_seqid_sync(oop);
14074 14082 open_owner_rele(oop);
14075 14083 }
14076 14084
14077 14085 if (do_flush_pages)
14078 14086 nfs4_flush_pages(vp, cred);
14079 14087
14080 14088 (void) convoff(vp, flk, whence, offset);
14081 14089
14082 14090 lm_rel_sysid(ls);
14083 14091
14084 14092 /*
14085 14093 * Record debug information in the event we get EINVAL.
14086 14094 */
14087 14095 mutex_enter(&mi->mi_lock);
14088 14096 if (*errorp == EINVAL && (lock_args || locku_args) &&
14089 14097 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14090 14098 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14091 14099 zcmn_err(getzoneid(), CE_NOTE,
14092 14100 "%s operation failed with "
14093 14101 "EINVAL probably since the server, %s,"
14094 14102 " doesn't support POSIX style locking",
14095 14103 lock_args ? "LOCK" : "LOCKU",
14096 14104 mi->mi_curr_serv->sv_hostname);
14097 14105 mi->mi_flags |= MI4_LOCK_DEBUG;
14098 14106 }
14099 14107 }
14100 14108 mutex_exit(&mi->mi_lock);
14101 14109
14102 14110 if (cred_otw)
14103 14111 crfree(cred_otw);
14104 14112 }
14105 14113
14106 14114 /*
14107 14115 * This calls the server and the local locking code.
14108 14116 *
 * Client locks are registered locally by ORing the sysid with
14110 14118 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14111 14119 * We need to distinguish between the two to avoid collision in case one
14112 14120 * machine is used as both client and server.
14113 14121 *
14114 14122 * Blocking lock requests will continually retry to acquire the lock
14115 14123 * forever.
14116 14124 *
14117 14125 * The ctype is defined as follows:
14118 14126 * NFS4_LCK_CTYPE_NORM: normal lock request.
14119 14127 *
14120 14128 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14121 14129 * recovery, get the pid from flk instead of curproc, and don't reregister
14122 14130 * the lock locally.
14123 14131 *
14124 14132 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14125 14133 * that we will use the information passed in via resend_rqstp to setup the
14126 14134 * lock/locku request. This resend is the exact same request as the 'lost
14127 14135 * lock', and is initiated by the recovery framework. A successful resend
14128 14136 * request can initiate one or more reinstate requests.
14129 14137 *
14130 14138 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14131 14139 * does not trigger additional reinstate requests. This lock call type is
14132 14140 * set for setting the v4 server's locking state back to match what the
14133 14141 * client's local locking state is in the event of a received 'lost lock'.
14134 14142 *
14135 14143 * Errors are returned via the nfs4_error_t parameter.
14136 14144 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay;	/* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly;	/* just queue request */
	int frc_no_reclaim = 0;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Re-entered after recoverable failures (NFS4ERR_DELAY, access
	 * errors needing a credential swap, blocking-lock retries, and
	 * completed recovery).  Each pass allocates a fresh two-element
	 * compound: argop[0] = PUTFH, argop[1] = LOCK/LOCKU/LOCKT.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		/* record the unlock locally so the app sees it succeed */
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/* tear down this attempt and retry */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A prior NFS4ERR_NO_GRACE reply forced us to retry the reclaim
	 * as an ordinary (non-reclaim) lock request.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		/* on TRUE, this drops all the refs and frees args/res */
		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set. Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14488 14496
14489 14497 /*
14490 14498 * nfs4_safelock:
14491 14499 *
14492 14500 * Return non-zero if the given lock request can be handled without
14493 14501 * violating the constraints on concurrent mapping and locking.
14494 14502 */
14495 14503
14496 14504 static int
14497 14505 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14498 14506 {
14499 14507 rnode4_t *rp = VTOR4(vp);
14500 14508 struct vattr va;
14501 14509 int error;
14502 14510
14503 14511 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14504 14512 ASSERT(rp->r_mapcnt >= 0);
14505 14513 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14506 14514 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14507 14515 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14508 14516 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14509 14517
14510 14518 if (rp->r_mapcnt == 0)
14511 14519 return (1); /* always safe if not mapped */
14512 14520
14513 14521 /*
14514 14522 * If the file is already mapped and there are locks, then they
14515 14523 * should be all safe locks. So adding or removing a lock is safe
14516 14524 * as long as the new request is safe (i.e., whole-file, meaning
14517 14525 * length and starting offset are both zero).
14518 14526 */
14519 14527
14520 14528 if (bfp->l_start != 0 || bfp->l_len != 0) {
14521 14529 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14522 14530 "cannot lock a memory mapped file unless locking the "
14523 14531 "entire file: start %"PRIx64", len %"PRIx64,
14524 14532 bfp->l_start, bfp->l_len));
14525 14533 return (0);
14526 14534 }
14527 14535
14528 14536 /* mandatory locking and mapping don't mix */
14529 14537 va.va_mask = AT_MODE;
14530 14538 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14531 14539 if (error != 0) {
14532 14540 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14533 14541 "getattr error %d", error));
14534 14542 return (0); /* treat errors conservatively */
14535 14543 }
14536 14544 if (MANDLOCK(vp, va.va_mode)) {
14537 14545 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14538 14546 "cannot mandatory lock and mmap a file"));
14539 14547 return (0);
14540 14548 }
14541 14549
14542 14550 return (1);
14543 14551 }
14544 14552
14545 14553
/*
 * Register the lock locally within Solaris.
 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
 * recording locks locally.
 *
 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
 * are registered locally.
 *
 * The caller's flk->l_sysid is saved and restored around the call, so the
 * LM_SYSID_CLIENT tagging is never visible to the caller.
 */
void
nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
    u_offset_t offset)
{
	int oldsysid;
	int error;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	/* fn_name() allocates MAXNAMELEN bytes; freed right after the log. */
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug,
	    (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
	    "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
	    name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
	    flk->l_sysid));
	kmem_free(name, MAXNAMELEN);
#endif

	/* register the lock with local locking */
	oldsysid = flk->l_sysid;
	flk->l_sysid |= LM_SYSID_CLIENT;
	error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
#ifdef DEBUG
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4_register_lock_locally: could not register with"
		    " local locking"));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "error %d, vp 0x%p, pid %d, sysid 0x%x",
		    error, (void *)vp, flk->l_pid, flk->l_sysid));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_type, flk->l_start, flk->l_len));
		/*
		 * Query (cmd == 0) the lock manager so we can report which
		 * lock blocked us; flk is overwritten with the conflicting
		 * lock's description.  Debug-only, so the clobber is fine.
		 */
		(void) reclock(vp, flk, 0, flag, offset, NULL);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "blocked by pid %d sysid 0x%x type %d "
		    "off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
		    flk->l_len));
	}
#endif
	/* restore the caller's sysid */
	flk->l_sysid = oldsysid;
}
14601 14609
/*
 * nfs4_lockrelease:
 *
 * Release any locks on the given vnode that are held by the current
 * process.  Also removes the lock owner (if one exists) from the rnode's
 * list.
 *
 * Returns zero on success, or the error from nfs4_start_fop() if we could
 * not synchronize with recovery (in which case R4LODANGLERS is set so the
 * dangling lock owners can be cleaned up later).
 */
static int
nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
{
	flock64_t ld;
	int ret, error;
	rnode4_t *rp;
	nfs4_lock_owner_t *lop;
	nfs4_recov_state_t recov_state;
	mntinfo4_t *mi;
	bool_t possible_orphan = FALSE;
	bool_t recovonly;

	ASSERT((uintptr_t)vp > KERNELBASE);
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we have not locked anything then we can
	 * just return since we have no work to do.
	 * (An empty r_lo_head list means no lock owners on this rnode.)
	 */
	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
		return (0);
	}

	/*
	 * We need to comprehend that another thread may
	 * kick off recovery and the lock_owner we have stashed
	 * in lop might be invalid so we should NOT cache it
	 * locally!
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		/* flag the rnode so dangling lock owners get cleaned up */
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	/*
	 * Check if the lock owner might have a lock (request was sent but
	 * no response was received).  Also check if there are any remote
	 * locks on the file.  (In theory we shouldn't have to make this
	 * second check if there's no lock owner, but for now we'll be
	 * conservative and do it anyway.)  If either condition is true,
	 * send an unlock for the entire file to the server.
	 *
	 * Note that no explicit synchronization is needed here.  At worst,
	 * flk_has_remote_locks() will return a false positive, in which case
	 * the unlock call wastes time but doesn't harm correctness.
	 */

	if (lop) {
		mutex_enter(&lop->lo_lock);
		possible_orphan = lop->lo_pending_rqsts;
		mutex_exit(&lop->lo_lock);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
	    (void *)lop));

	if (possible_orphan || flk_has_remote_locks(vp)) {
		ld.l_type = F_UNLCK;	/* set to unlock entire file */
		ld.l_whence = 0;	/* unlock from start of file */
		ld.l_start = 0;
		ld.l_len = 0;	/* do entire file */

		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
		    cr, NULL);

		if (ret != 0) {
			/*
			 * If VOP_FRLOCK fails, make sure we unregister
			 * local locks before we continue.
			 */
			ld.l_pid = ttoproc(curthread)->p_pid;
			nfs4_register_lock_locally(vp, &ld, flag, offset);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_lockrelease: lock release error on vp"
			    " %p: error %d.\n", (void *)vp, ret));
		}
	}

	/*
	 * Re-synchronize with recovery before touching the lock-owner
	 * list; the VOP_FRLOCK above dropped our earlier start_fop.
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * So, here we're going to need to retrieve the lock-owner
	 * again (in case recovery has done a switch-a-roo) and
	 * remove it because we can.
	 */
	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	if (lop) {
		nfs4_rnode_remove_lock_owner(rp, lop);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
	return (0);
}
14729 14737
14730 14738 /*
14731 14739 * Wait for 'tick_delay' clock ticks.
14732 14740 * Implement exponential backoff until hit the lease_time of this nfs4_server.
14733 14741 * NOTE: lock_lease_time is in seconds.
14734 14742 *
14735 14743 * XXX For future improvements, should implement a waiting queue scheme.
14736 14744 */
14737 14745 static int
14738 14746 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14739 14747 {
14740 14748 long milliseconds_delay;
14741 14749 time_t lock_lease_time;
14742 14750
14743 14751 /* wait tick_delay clock ticks or siginteruptus */
14744 14752 if (delay_sig(*tick_delay)) {
14745 14753 return (EINTR);
14746 14754 }
14747 14755 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14748 14756 "reissue the lock request: blocked for %ld clock ticks: %ld "
14749 14757 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14750 14758
14751 14759 /* get the lease time */
14752 14760 lock_lease_time = r2lease_time(rp);
14753 14761
14754 14762 /* drv_hztousec converts ticks to microseconds */
14755 14763 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14756 14764 if (milliseconds_delay < lock_lease_time * 1000) {
14757 14765 *tick_delay = 2 * *tick_delay;
14758 14766 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14759 14767 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14760 14768 }
14761 14769 return (0);
14762 14770 }
14763 14771
14764 14772
/*
 * Module initialization hook for the NFSv4 client vnode ops.
 * Currently a no-op; kept as a placeholder for symmetry with
 * nfs4_vnops_fini() and for future setup needs.
 */
void
nfs4_vnops_init(void)
{
}
14769 14777
/*
 * Module teardown hook for the NFSv4 client vnode ops.
 * Currently a no-op; mirrors nfs4_vnops_init().
 */
void
nfs4_vnops_fini(void)
{
}
14774 14782
/*
 * Return a reference to the directory (parent) vnode for a given vnode,
 * using the saved pathname information and the directory file handle. The
 * caller is responsible for disposing of the reference.
 * Returns zero or an errno value.
 *
 * Caller should set need_start_op to FALSE if it is the recovery
 * thread, or if a start_fop has already been done. Otherwise, TRUE.
 */
int
vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
{
	svnode_t *svnp;
	vnode_t *dvp = NULL;
	servinfo4_t *svp;
	nfs4_fname_t *mfname;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (vp->v_flag & VROOT) {
		nfs4_sharedfh_t *sfh;
		nfs_fh4 fh;
		mntinfo4_t *mi;

		/*
		 * VROOT on a regular file means this is a "file" mount;
		 * the parent comes from the server's parent file handle
		 * (sv_pfhandle) rather than from shadow-vnode state.
		 */
		ASSERT(vp->v_type == VREG);

		mi = VTOMI4(vp);
		svp = mi->mi_curr_serv;
		/* sv_lock protects the pfhandle while we copy it */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh = sfh4_get(&fh, VTOMI4(vp));
		nfs_rw_exit(&svp->sv_lock);
		/* hold mi_fname: makenfs4node_by_fh consumes a reference */
		mfname = mi->mi_fname;
		fn_hold(mfname);
		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
		sfh4_rele(&sfh);

		if (dvp->v_type == VNON)
			dvp->v_type = VDIR;
		*dvpp = dvp;
		return (0);
	}

	svnp = VTOSV(vp);

	if (svnp == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node is NULL"));
		return (EINVAL);
	}

	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node name or dfh val == NULL"));
		return (EINVAL);
	}

	/* look the parent up via its saved directory file handle */
	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
	    (int)need_start_op);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned %d", error));
		return (error);
	}
	if (!dvp) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned a NULL dvp"));
		return (EIO);
	}
	if (dvp->v_type == VNON)
		dvp->v_type = VDIR;
	ASSERT(dvp->v_type == VDIR);
	/* parent of an extended attribute file is an xattr directory */
	if (VTOR4(vp)->r_flags & R4ISXATTR) {
		mutex_enter(&dvp->v_lock);
		dvp->v_flag |= V_XATTRDIR;
		mutex_exit(&dvp->v_lock);
	}
	*dvpp = dvp;
	return (0);
}
14857 14865
14858 14866 /*
14859 14867 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14860 14868 * length that fnamep can accept, including the trailing null.
14861 14869 * Returns 0 if okay, returns an errno value if there was a problem.
14862 14870 */
14863 14871
14864 14872 int
14865 14873 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14866 14874 {
14867 14875 char *fn;
14868 14876 int err = 0;
14869 14877 servinfo4_t *svp;
14870 14878 svnode_t *shvp;
14871 14879
14872 14880 /*
14873 14881 * If the file being opened has VROOT set, then this is
14874 14882 * a "file" mount. sv_name will not be interesting, so
14875 14883 * go back to the servinfo4 to get the original mount
14876 14884 * path and strip off all but the final edge. Otherwise
14877 14885 * just return the name from the shadow vnode.
14878 14886 */
14879 14887
14880 14888 if (vp->v_flag & VROOT) {
14881 14889
14882 14890 svp = VTOMI4(vp)->mi_curr_serv;
14883 14891 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14884 14892
14885 14893 fn = strrchr(svp->sv_path, '/');
14886 14894 if (fn == NULL)
14887 14895 err = EINVAL;
14888 14896 else
14889 14897 fn++;
14890 14898 } else {
14891 14899 shvp = VTOSV(vp);
14892 14900 fn = fn_name(shvp->sv_name);
14893 14901 }
14894 14902
14895 14903 if (err == 0)
14896 14904 if (strlen(fn) < maxlen)
14897 14905 (void) strcpy(fnamep, fn);
14898 14906 else
14899 14907 err = ENAMETOOLONG;
14900 14908
14901 14909 if (vp->v_flag & VROOT)
14902 14910 nfs_rw_exit(&svp->sv_lock);
14903 14911 else
14904 14912 kmem_free(fn, MAXNAMELEN);
14905 14913
14906 14914 return (err);
14907 14915 }
14908 14916
/*
 * Bookkeeping for a close that doesn't need to go over the wire.
 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
 * it is left at 1.
 *
 * Caller must hold osp->os_sync_lock (and indicate so via *have_lockp).
 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/* nothing to do if the stream is invalid or still in use */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count can not go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}
14951 14959
/*
 * Close all remaining open streams on the rnode.  These open streams
 * could be here because:
 * - The close attempted at either close or delmap failed
 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
 * - Someone did mknod on a regular file but never opened it
 *
 * Returns 0, or the first error (or mapped NFS4 status) encountered
 * while force-closing the streams.
 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1.  Since then,
	 * the vnode could have been re-activated.  We want to
	 * loop through the open streams and close each one, but
	 * we have to be careful since once we release the rnode
	 * hash bucket lock, someone else is free to come in and
	 * re-activate the rnode and add new open streams.  The
	 * strategy is take the rnode hash bucket lock, verify that
	 * the count is still 1, grab the open stream off the
	 * head of the list and mark it invalid, then release the
	 * rnode hash bucket lock and proceed with that open stream.
	 * This is ok because nfs4close_one() will acquire the proper
	 * open/create to close/destroy synchronization for open
	 * streams, and will ensure that if someone has reopened
	 * the open stream after we've dropped the hash bucket lock
	 * then we'll just simply return without destroying the
	 * open stream.
	 * Repeat until the list is empty.
	 */

	for (;;) {

		/* make sure vnode hasn't been reactivated */
		rw_enter(&rp->r_hashq->r_lock, RW_READER);
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			mutex_exit(&vp->v_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}
		/*
		 * Grabbing r_os_lock before releasing v_lock prevents
		 * a window where the rnode/open stream could get
		 * reactivated (and os_force_close set to 0) before we
		 * had a chance to set os_force_close to 1.
		 */
		mutex_enter(&rp->r_os_lock);
		mutex_exit(&vp->v_lock);

		osp = list_head(&rp->r_open_streams);
		if (!osp) {
			/* nothing left to CLOSE OTW, so return */
			mutex_exit(&rp->r_os_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}

		mutex_enter(&rp->r_statev4_lock);
		/* the file can't still be mem mapped */
		ASSERT(rp->r_mapcnt == 0);
		if (rp->created_v4)
			rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		/*
		 * Grab a ref on this open stream; nfs4close_one
		 * will mark it as invalid
		 */
		mutex_enter(&osp->os_sync_lock);
		osp->os_ref_count++;
		osp->os_force_close = 1;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);

		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);

		/* Update error if it isn't already non-zero */
		if (error == 0) {
			if (e.error)
				error = e.error;
			else if (e.stat)
				error = geterrno4(e.stat);
		}

#ifdef DEBUG
		nfs4close_all_cnt++;
#endif
		/* Release the ref on osp acquired above. */
		open_stream_rele(osp, rp);

		/* Proceed to the next open stream, if any */
	}
	return (error);
}
15058 15066
15059 15067 /*
15060 15068 * nfs4close_one - close one open stream for a file if needed.
15061 15069 *
15062 15070 * "close_type" indicates which close path this is:
15063 15071 * CLOSE_NORM: close initiated via VOP_CLOSE.
15064 15072 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15065 15073 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15066 15074 * the close and release of client state for this open stream
15067 15075 * (unless someone else has the open stream open).
15068 15076 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15069 15077 * (e.g., due to abort because of a signal).
15070 15078 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15071 15079 *
15072 15080 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15073 15081 * recovery. Instead, the caller is expected to deal with retries.
15074 15082 *
15075 15083 * The caller can either pass in the osp ('provided_osp') or not.
15076 15084 *
15077 15085 * 'access_bits' represents the access we are closing/downgrading.
15078 15086 *
15079 15087 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15080 15088 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15081 15089 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15082 15090 *
15083 15091 * Errors are returned via the nfs4_error_t.
15084 15092 */
15085 15093 void
15086 15094 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15087 15095 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15088 15096 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15089 15097 uint_t mmap_flags)
15090 15098 {
15091 15099 nfs4_open_owner_t *oop;
15092 15100 nfs4_open_stream_t *osp = NULL;
15093 15101 int retry = 0;
15094 15102 int num_retries = NFS4_NUM_RECOV_RETRIES;
15095 15103 rnode4_t *rp;
15096 15104 mntinfo4_t *mi;
15097 15105 nfs4_recov_state_t recov_state;
15098 15106 cred_t *cred_otw = NULL;
15099 15107 bool_t recovonly = FALSE;
15100 15108 int isrecov;
15101 15109 int force_close;
15102 15110 int close_failed = 0;
15103 15111 int did_dec_count = 0;
15104 15112 int did_start_op = 0;
15105 15113 int did_force_recovlock = 0;
15106 15114 int did_start_seqid_sync = 0;
15107 15115 int have_sync_lock = 0;
15108 15116
15109 15117 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15110 15118
15111 15119 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15112 15120 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15113 15121 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15114 15122 len, maxprot, mmap_flags, access_bits));
15115 15123
15116 15124 nfs4_error_zinit(ep);
15117 15125 rp = VTOR4(vp);
15118 15126 mi = VTOMI4(vp);
15119 15127 isrecov = (close_type == CLOSE_RESEND ||
15120 15128 close_type == CLOSE_AFTER_RESEND);
15121 15129
15122 15130 /*
15123 15131 * First get the open owner.
15124 15132 */
15125 15133 if (!provided_osp) {
15126 15134 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15127 15135 } else {
15128 15136 oop = provided_osp->os_open_owner;
15129 15137 ASSERT(oop != NULL);
15130 15138 open_owner_hold(oop);
15131 15139 }
15132 15140
15133 15141 if (!oop) {
15134 15142 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15135 15143 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15136 15144 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15137 15145 (void *)provided_osp, close_type));
15138 15146 ep->error = EIO;
15139 15147 goto out;
15140 15148 }
15141 15149
15142 15150 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15143 15151 recov_retry:
15144 15152 osp = NULL;
15145 15153 close_failed = 0;
15146 15154 force_close = (close_type == CLOSE_FORCE);
15147 15155 retry = 0;
15148 15156 did_start_op = 0;
15149 15157 did_force_recovlock = 0;
15150 15158 did_start_seqid_sync = 0;
15151 15159 have_sync_lock = 0;
15152 15160 recovonly = FALSE;
15153 15161 recov_state.rs_flags = 0;
15154 15162 recov_state.rs_num_retry_despite_err = 0;
15155 15163
15156 15164 /*
15157 15165 * Second synchronize with recovery.
15158 15166 */
15159 15167 if (!isrecov) {
15160 15168 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15161 15169 &recov_state, &recovonly);
15162 15170 if (!ep->error) {
15163 15171 did_start_op = 1;
15164 15172 } else {
15165 15173 close_failed = 1;
15166 15174 /*
15167 15175 * If we couldn't get start_fop, but have to
15168 15176 * cleanup state, then at least acquire the
15169 15177 * mi_recovlock so we can synchronize with
15170 15178 * recovery.
15171 15179 */
15172 15180 if (close_type == CLOSE_FORCE) {
15173 15181 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15174 15182 RW_READER, FALSE);
15175 15183 did_force_recovlock = 1;
15176 15184 } else
15177 15185 goto out;
15178 15186 }
15179 15187 }
15180 15188
15181 15189 /*
15182 15190 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15183 15191 * set 'recovonly' to TRUE since most likely this is due to
15184 15192 * recovery being active (MI4_RECOV_ACTIV). If recovery is active,
15185 15193 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15186 15194 * to retry, causing us to loop until recovery finishes. Plus we
15187 15195 * don't need protection over the open seqid since we're not going
15188 15196 * OTW, hence don't need to use the seqid.
15189 15197 */
15190 15198 if (recovonly == FALSE) {
15191 15199 /* need to grab the open owner sync before 'os_sync_lock' */
15192 15200 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15193 15201 if (ep->error == EAGAIN) {
15194 15202 ASSERT(!isrecov);
15195 15203 if (did_start_op)
15196 15204 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15197 15205 &recov_state, TRUE);
15198 15206 if (did_force_recovlock)
15199 15207 nfs_rw_exit(&mi->mi_recovlock);
15200 15208 goto recov_retry;
15201 15209 }
15202 15210 did_start_seqid_sync = 1;
15203 15211 }
15204 15212
15205 15213 /*
15206 15214 * Third get an open stream and acquire 'os_sync_lock' to
15207 15215 * synchronize the opening/creating of an open stream with the
15208 15216 * closing/destroying of an open stream.
15209 15217 */
15210 15218 if (!provided_osp) {
15211 15219 /* returns with 'os_sync_lock' held */
15212 15220 osp = find_open_stream(oop, rp);
15213 15221 if (!osp) {
15214 15222 ep->error = EIO;
15215 15223 goto out;
15216 15224 }
15217 15225 } else {
15218 15226 osp = provided_osp;
15219 15227 open_stream_hold(osp);
15220 15228 mutex_enter(&osp->os_sync_lock);
15221 15229 }
15222 15230 have_sync_lock = 1;
15223 15231
15224 15232 ASSERT(oop == osp->os_open_owner);
15225 15233
15226 15234 /*
15227 15235 * Fourth, do any special pre-OTW CLOSE processing
15228 15236 * based on the specific close type.
15229 15237 */
15230 15238 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15231 15239 !did_dec_count) {
15232 15240 ASSERT(osp->os_open_ref_count > 0);
15233 15241 osp->os_open_ref_count--;
15234 15242 did_dec_count = 1;
15235 15243 if (osp->os_open_ref_count == 0)
15236 15244 osp->os_final_close = 1;
15237 15245 }
15238 15246
15239 15247 if (close_type == CLOSE_FORCE) {
15240 15248 /* see if somebody reopened the open stream. */
15241 15249 if (!osp->os_force_close) {
15242 15250 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15243 15251 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15244 15252 "was reopened, vp %p", (void *)osp, (void *)vp));
15245 15253 ep->error = 0;
15246 15254 ep->stat = NFS4_OK;
15247 15255 goto out;
15248 15256 }
15249 15257
15250 15258 if (!osp->os_final_close && !did_dec_count) {
15251 15259 osp->os_open_ref_count--;
15252 15260 did_dec_count = 1;
15253 15261 }
15254 15262
15255 15263 /*
15256 15264 * We can't depend on os_open_ref_count being 0 due to the
15257 15265 * way executables are opened (VN_RELE to match a VOP_OPEN).
15258 15266 */
15259 15267 #ifdef NOTYET
15260 15268 ASSERT(osp->os_open_ref_count == 0);
15261 15269 #endif
15262 15270 if (osp->os_open_ref_count != 0) {
15263 15271 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15264 15272 "nfs4close_one: should panic here on an "
15265 15273 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15266 15274 "since this is probably the exec problem."));
15267 15275
15268 15276 osp->os_open_ref_count = 0;
15269 15277 }
15270 15278
15271 15279 /*
15272 15280 * There is the possibility that nfs4close_one()
15273 15281 * for close_type == CLOSE_DELMAP couldn't find the
15274 15282 * open stream, thus couldn't decrement its os_mapcnt;
15275 15283 * therefore we can't use this ASSERT yet.
15276 15284 */
15277 15285 #ifdef NOTYET
15278 15286 ASSERT(osp->os_mapcnt == 0);
15279 15287 #endif
15280 15288 osp->os_mapcnt = 0;
15281 15289 }
15282 15290
15283 15291 if (close_type == CLOSE_DELMAP && !did_dec_count) {
15284 15292 ASSERT(osp->os_mapcnt >= btopr(len));
15285 15293
15286 15294 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15287 15295 osp->os_mmap_write -= btopr(len);
15288 15296 if (maxprot & PROT_READ)
15289 15297 osp->os_mmap_read -= btopr(len);
15290 15298 if (maxprot & PROT_EXEC)
15291 15299 osp->os_mmap_read -= btopr(len);
15292 15300 /* mirror the PROT_NONE check in nfs4_addmap() */
15293 15301 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15294 15302 !(maxprot & PROT_EXEC))
15295 15303 osp->os_mmap_read -= btopr(len);
15296 15304 osp->os_mapcnt -= btopr(len);
15297 15305 did_dec_count = 1;
15298 15306 }
15299 15307
15300 15308 if (recovonly) {
15301 15309 nfs4_lost_rqst_t lost_rqst;
15302 15310
15303 15311 /* request should not already be in recovery queue */
15304 15312 ASSERT(lrp == NULL);
15305 15313 nfs4_error_init(ep, EINTR);
15306 15314 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15307 15315 osp, cred_otw, vp);
15308 15316 mutex_exit(&osp->os_sync_lock);
15309 15317 have_sync_lock = 0;
15310 15318 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15311 15319 lost_rqst.lr_op == OP_CLOSE ?
15312 15320 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
15313 15321 close_failed = 1;
15314 15322 force_close = 0;
15315 15323 goto close_cleanup;
15316 15324 }
15317 15325
15318 15326 /*
15319 15327 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15320 15328 * we stopped operating on the open owner's <old oo_name, old seqid>
15321 15329 * space, which means we stopped operating on the open stream
15322 15330 * too. So don't go OTW (as the seqid is likely bad, and the
15323 15331 * stateid could be stale, potentially triggering a false
15324 15332 * setclientid), and just clean up the client's internal state.
15325 15333 */
15326 15334 if (osp->os_orig_oo_name != oop->oo_name) {
15327 15335 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15328 15336 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15329 15337 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15330 15338 "oo_name %" PRIx64")",
15331 15339 (void *)osp, (void *)oop, osp->os_orig_oo_name,
15332 15340 oop->oo_name));
15333 15341 close_failed = 1;
15334 15342 }
15335 15343
15336 15344 /* If the file failed recovery, just quit. */
15337 15345 mutex_enter(&rp->r_statelock);
15338 15346 if (rp->r_flags & R4RECOVERR) {
15339 15347 close_failed = 1;
15340 15348 }
15341 15349 mutex_exit(&rp->r_statelock);
15342 15350
15343 15351 /*
15344 15352 * If the force close path failed to obtain start_fop
15345 15353 * then skip the OTW close and just remove the state.
15346 15354 */
15347 15355 if (close_failed)
15348 15356 goto close_cleanup;
15349 15357
15350 15358 /*
15351 15359 * Fifth, check to see if there are still mapped pages or other
15352 15360 * opens using this open stream. If there are then we can't
15353 15361 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15354 15362 */
15355 15363 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15356 15364 nfs4_lost_rqst_t new_lost_rqst;
15357 15365 bool_t needrecov = FALSE;
15358 15366 cred_t *odg_cred_otw = NULL;
15359 15367 seqid4 open_dg_seqid = 0;
15360 15368
15361 15369 if (osp->os_delegation) {
15362 15370 /*
15363 15371 * If this open stream was never OPENed OTW then we
15364 15372 * surely can't DOWNGRADE it (especially since the
15365 15373 * osp->open_stateid is really a delegation stateid
15366 15374 * when os_delegation is 1).
15367 15375 */
15368 15376 if (access_bits & FREAD)
15369 15377 osp->os_share_acc_read--;
15370 15378 if (access_bits & FWRITE)
15371 15379 osp->os_share_acc_write--;
15372 15380 osp->os_share_deny_none--;
15373 15381 nfs4_error_zinit(ep);
15374 15382 goto out;
15375 15383 }
15376 15384 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15377 15385 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15378 15386 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15379 15387 if (needrecov && !isrecov) {
15380 15388 bool_t abort;
15381 15389 nfs4_bseqid_entry_t *bsep = NULL;
15382 15390
15383 15391 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15384 15392 bsep = nfs4_create_bseqid_entry(oop, NULL,
15385 15393 vp, 0,
15386 15394 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15387 15395 open_dg_seqid);
15388 15396
15389 15397 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15390 15398 oop, osp, odg_cred_otw, vp, access_bits, 0);
15391 15399 mutex_exit(&osp->os_sync_lock);
15392 15400 have_sync_lock = 0;
15393 15401 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15394 15402 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15395 15403 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15396 15404 bsep, NULL, NULL);
15397 15405 if (odg_cred_otw)
15398 15406 crfree(odg_cred_otw);
15399 15407 if (bsep)
15400 15408 kmem_free(bsep, sizeof (*bsep));
15401 15409
15402 15410 if (abort == TRUE)
15403 15411 goto out;
15404 15412
15405 15413 if (did_start_seqid_sync) {
15406 15414 nfs4_end_open_seqid_sync(oop);
15407 15415 did_start_seqid_sync = 0;
15408 15416 }
15409 15417 open_stream_rele(osp, rp);
15410 15418
15411 15419 if (did_start_op)
15412 15420 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15413 15421 &recov_state, FALSE);
15414 15422 if (did_force_recovlock)
15415 15423 nfs_rw_exit(&mi->mi_recovlock);
15416 15424
15417 15425 goto recov_retry;
15418 15426 } else {
15419 15427 if (odg_cred_otw)
15420 15428 crfree(odg_cred_otw);
15421 15429 }
15422 15430 goto out;
15423 15431 }
15424 15432
15425 15433 /*
15426 15434 * If this open stream was created as the results of an open
15427 15435 * while holding a delegation, then just release it; no need
15428 15436 * to do an OTW close. Otherwise do a "normal" OTW close.
15429 15437 */
15430 15438 if (osp->os_delegation) {
15431 15439 nfs4close_notw(vp, osp, &have_sync_lock);
15432 15440 nfs4_error_zinit(ep);
15433 15441 goto out;
15434 15442 }
15435 15443
15436 15444 /*
15437 15445 * If this stream is not valid, we're done.
15438 15446 */
15439 15447 if (!osp->os_valid) {
15440 15448 nfs4_error_zinit(ep);
15441 15449 goto out;
15442 15450 }
15443 15451
15444 15452 /*
15445 15453 * Last open or mmap ref has vanished, need to do an OTW close.
15446 15454 * First check to see if a close is still necessary.
15447 15455 */
15448 15456 if (osp->os_failed_reopen) {
15449 15457 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15450 15458 "don't close OTW osp %p since reopen failed.",
15451 15459 (void *)osp));
15452 15460 /*
15453 15461 * Reopen of the open stream failed, hence the
15454 15462 * stateid of the open stream is invalid/stale, and
15455 15463 * sending this OTW would incorrectly cause another
15456 15464 * round of recovery. In this case, we need to set
15457 15465 * the 'os_valid' bit to 0 so another thread doesn't
15458 15466 * come in and re-open this open stream before
15459 15467 * this "closing" thread cleans up state (decrementing
15460 15468 * the nfs4_server_t's state_ref_count and decrementing
15461 15469 * the os_ref_count).
15462 15470 */
15463 15471 osp->os_valid = 0;
15464 15472 /*
15465 15473 * This removes the reference obtained at OPEN; ie,
15466 15474 * when the open stream structure was created.
15467 15475 *
15468 15476 * We don't have to worry about calling 'open_stream_rele'
15469 15477 * since we our currently holding a reference to this
15470 15478 * open stream which means the count can not go to 0 with
15471 15479 * this decrement.
15472 15480 */
15473 15481 ASSERT(osp->os_ref_count >= 2);
15474 15482 osp->os_ref_count--;
15475 15483 nfs4_error_zinit(ep);
15476 15484 close_failed = 0;
15477 15485 goto close_cleanup;
15478 15486 }
15479 15487
15480 15488 ASSERT(osp->os_ref_count > 1);
15481 15489
15482 15490 /*
15483 15491 * Sixth, try the CLOSE OTW.
15484 15492 */
15485 15493 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15486 15494 close_type, ep, &have_sync_lock);
15487 15495
15488 15496 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15489 15497 /*
15490 15498 * Let the recovery thread be responsible for
15491 15499 * removing the state for CLOSE.
15492 15500 */
15493 15501 close_failed = 1;
15494 15502 force_close = 0;
15495 15503 retry = 0;
15496 15504 }
15497 15505
15498 15506 /* See if we need to retry with a different cred */
15499 15507 if ((ep->error == EACCES ||
15500 15508 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15501 15509 cred_otw != cr) {
15502 15510 crfree(cred_otw);
15503 15511 cred_otw = cr;
15504 15512 crhold(cred_otw);
15505 15513 retry = 1;
15506 15514 }
15507 15515
15508 15516 if (ep->error || ep->stat)
15509 15517 close_failed = 1;
15510 15518
15511 15519 if (retry && !isrecov && num_retries-- > 0) {
15512 15520 if (have_sync_lock) {
15513 15521 mutex_exit(&osp->os_sync_lock);
15514 15522 have_sync_lock = 0;
15515 15523 }
15516 15524 if (did_start_seqid_sync) {
15517 15525 nfs4_end_open_seqid_sync(oop);
15518 15526 did_start_seqid_sync = 0;
15519 15527 }
15520 15528 open_stream_rele(osp, rp);
15521 15529
15522 15530 if (did_start_op)
15523 15531 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15524 15532 &recov_state, FALSE);
15525 15533 if (did_force_recovlock)
15526 15534 nfs_rw_exit(&mi->mi_recovlock);
15527 15535 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15528 15536 "nfs4close_one: need to retry the close "
15529 15537 "operation"));
15530 15538 goto recov_retry;
15531 15539 }
15532 15540 close_cleanup:
15533 15541 /*
15534 15542 * Seventh and lastly, process our results.
15535 15543 */
15536 15544 if (close_failed && force_close) {
15537 15545 /*
15538 15546 * It's ok to drop and regrab the 'os_sync_lock' since
15539 15547 * nfs4close_notw() will recheck to make sure the
15540 15548 * "close"/removal of state should happen.
15541 15549 */
15542 15550 if (!have_sync_lock) {
15543 15551 mutex_enter(&osp->os_sync_lock);
15544 15552 have_sync_lock = 1;
15545 15553 }
15546 15554 /*
15547 15555 * This is last call, remove the ref on the open
15548 15556 * stream created by open and clean everything up.
15549 15557 */
15550 15558 osp->os_pending_close = 0;
15551 15559 nfs4close_notw(vp, osp, &have_sync_lock);
15552 15560 nfs4_error_zinit(ep);
15553 15561 }
15554 15562
15555 15563 if (!close_failed) {
15556 15564 if (have_sync_lock) {
15557 15565 osp->os_pending_close = 0;
15558 15566 mutex_exit(&osp->os_sync_lock);
15559 15567 have_sync_lock = 0;
15560 15568 } else {
15561 15569 mutex_enter(&osp->os_sync_lock);
15562 15570 osp->os_pending_close = 0;
15563 15571 mutex_exit(&osp->os_sync_lock);
15564 15572 }
15565 15573 if (did_start_op && recov_state.rs_sp != NULL) {
15566 15574 mutex_enter(&recov_state.rs_sp->s_lock);
15567 15575 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15568 15576 mutex_exit(&recov_state.rs_sp->s_lock);
15569 15577 } else {
15570 15578 nfs4_dec_state_ref_count(mi);
15571 15579 }
15572 15580 nfs4_error_zinit(ep);
15573 15581 }
15574 15582
15575 15583 out:
15576 15584 if (have_sync_lock)
15577 15585 mutex_exit(&osp->os_sync_lock);
15578 15586 if (did_start_op)
15579 15587 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15580 15588 recovonly ? TRUE : FALSE);
15581 15589 if (did_force_recovlock)
15582 15590 nfs_rw_exit(&mi->mi_recovlock);
15583 15591 if (cred_otw)
15584 15592 crfree(cred_otw);
15585 15593 if (osp)
15586 15594 open_stream_rele(osp, rp);
15587 15595 if (oop) {
15588 15596 if (did_start_seqid_sync)
15589 15597 nfs4_end_open_seqid_sync(oop);
15590 15598 open_owner_rele(oop);
15591 15599 }
15592 15600 }
15593 15601
15594 15602 /*
15595 15603 * Convert information returned by the server in the LOCK4denied
15596 15604 * structure to the form required by fcntl.
15597 15605 */
15598 15606 static void
15599 15607 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15600 15608 {
15601 15609 nfs4_lo_name_t *lo;
15602 15610
15603 15611 #ifdef DEBUG
15604 15612 if (denied_to_flk_debug) {
15605 15613 lockt_denied_debug = lockt_denied;
15606 15614 debug_enter("lockt_denied");
15607 15615 }
15608 15616 #endif
15609 15617
15610 15618 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15611 15619 flk->l_whence = 0; /* aka SEEK_SET */
15612 15620 flk->l_start = lockt_denied->offset;
15613 15621 flk->l_len = lockt_denied->length;
15614 15622
15615 15623 /*
15616 15624 * If the blocking clientid matches our client id, then we can
15617 15625 * interpret the lockowner (since we built it). If not, then
15618 15626 * fabricate a sysid and pid. Note that the l_sysid field
15619 15627 * in *flk already has the local sysid.
15620 15628 */
15621 15629
15622 15630 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15623 15631
15624 15632 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15625 15633 lo = (nfs4_lo_name_t *)
15626 15634 lockt_denied->owner.owner_val;
15627 15635
15628 15636 flk->l_pid = lo->ln_pid;
15629 15637 } else {
15630 15638 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15631 15639 "denied_to_flk: bad lock owner length\n"));
15632 15640
15633 15641 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15634 15642 }
15635 15643 } else {
15636 15644 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15637 15645 "denied_to_flk: foreign clientid\n"));
15638 15646
15639 15647 /*
15640 15648 * Construct a new sysid which should be different from
15641 15649 * sysids of other systems.
15642 15650 */
15643 15651
15644 15652 flk->l_sysid++;
15645 15653 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15646 15654 }
15647 15655 }
15648 15656
15649 15657 static pid_t
15650 15658 lo_to_pid(lock_owner4 *lop)
15651 15659 {
15652 15660 pid_t pid = 0;
15653 15661 uchar_t *cp;
15654 15662 int i;
15655 15663
15656 15664 cp = (uchar_t *)&lop->clientid;
15657 15665
15658 15666 for (i = 0; i < sizeof (lop->clientid); i++)
15659 15667 pid += (pid_t)*cp++;
15660 15668
15661 15669 cp = (uchar_t *)lop->owner_val;
15662 15670
15663 15671 for (i = 0; i < lop->owner_len; i++)
15664 15672 pid += (pid_t)*cp++;
15665 15673
15666 15674 return (pid);
15667 15675 }
15668 15676
15669 15677 /*
15670 15678 * Given a lock pointer, returns the length of that lock.
15671 15679 * "end" is the last locked offset the "l_len" covers from
15672 15680 * the start of the lock.
15673 15681 */
15674 15682 static off64_t
15675 15683 lock_to_end(flock64_t *lock)
15676 15684 {
15677 15685 off64_t lock_end;
15678 15686
15679 15687 if (lock->l_len == 0)
15680 15688 lock_end = (off64_t)MAXEND;
15681 15689 else
15682 15690 lock_end = lock->l_start + lock->l_len - 1;
15683 15691
15684 15692 return (lock_end);
15685 15693 }
15686 15694
15687 15695 /*
15688 15696 * Given the end of a lock, it will return you the length "l_len" for that lock.
15689 15697 */
15690 15698 static off64_t
15691 15699 end_to_len(off64_t start, off64_t end)
15692 15700 {
15693 15701 off64_t lock_len;
15694 15702
15695 15703 ASSERT(end >= start);
15696 15704 if (end == MAXEND)
15697 15705 lock_len = 0;
15698 15706 else
15699 15707 lock_len = end - start + 1;
15700 15708
15701 15709 return (lock_len);
15702 15710 }
15703 15711
15704 15712 /*
15705 15713 * On given end for a lock it determines if it is the last locked offset
15706 15714 * or not, if so keeps it as is, else adds one to return the length for
15707 15715 * valid start.
15708 15716 */
15709 15717 static off64_t
15710 15718 start_check(off64_t x)
15711 15719 {
15712 15720 if (x == MAXEND)
15713 15721 return (x);
15714 15722 else
15715 15723 return (x + 1);
15716 15724 }
15717 15725
15718 15726 /*
15719 15727 * See if these two locks overlap, and if so return 1;
15720 15728 * otherwise, return 0.
15721 15729 */
15722 15730 static int
15723 15731 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15724 15732 {
15725 15733 off64_t llfp_end, curfp_end;
15726 15734
15727 15735 llfp_end = lock_to_end(llfp);
15728 15736 curfp_end = lock_to_end(curfp);
15729 15737
15730 15738 if (((llfp_end >= curfp->l_start) &&
15731 15739 (llfp->l_start <= curfp->l_start)) ||
15732 15740 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15733 15741 return (1);
15734 15742 return (0);
15735 15743 }
15736 15744
/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).  The new entry
 * inherits its lock type/pid/sysid from local_flp.  If the two locks
 * do not intersect, nothing is added.
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and insert
	 * it into the new list in increasing l_start order. This list contains
	 * intersections of locks registered by the client with the local host
	 * and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	/* Walk to the first entry whose l_start is >= the new entry's. */
	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
15809 15817
/*
 * Our local locking current state is potentially different than
 * what the NFSv4 server thinks we have due to a lost lock that was
 * resent and then received. We need to reset our "NFSv4" locking
 * state to match the current local locking state for this pid since
 * that is what the user/application sees as what the world is.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process. This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Find active locks for this vp from the local locking code.
	 * Scan through this list and find out the locks that intersect with
	 * the lost lock. Once we find the lock that intersects, add the
	 * intersection area as a new lock to a new list "ri_llp". The lock
	 * type of the intersection region lock added to ri_llp is the same
	 * as that found in the active lock list, "list". The intersecting
	 * region locks are added to ri_llp in increasing l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Now we have the list of intersections with the lost lock. These are
	 * the locks that were/are active before the server replied to the
	 * last/lost lock. Issue these locks to the server here. Playing these
	 * locks to the server will re-establish our current local locking state
	 * with the v4 server.
	 * If we get an error, send SIGLOST to the application for that lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have: if the server's
		 * idea of the lock type (the lost lock's) already matches
		 * the local lock type for this range, skip it.
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}

	/*
	 * Now keeping the start of the lost lock as our reference parse the
	 * newly created ri_llp locklist to find the ranges that we have locked
	 * with the v4 server but not in the current local locking. We need
	 * to unlock these ranges.
	 * These ranges can also be referred to as those ranges, where the lost
	 * lock does not overlap with the locks in the ri_llp but are locked
	 * since the server replied to the lost lock.
	 */
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	/* Template unlock request; l_start/l_len are filled in per gap. */
	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		/*
		 * No gap before this intersection entry; advance the
		 * cursor past it.  (ri_llp is sorted by l_start, so
		 * gaps are detected purely by comparing l_start with
		 * the running cursor.)
		 */
		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting locks,
	 * unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it a to-EOF lock? if so unlock till the end
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return back to the original calling nfs4frlock()
	 * and let us naturally drop our seqid syncs.
	 */
}
15964 15972
/*
 * Create a lost state record for the given lock reinstantiation request
 * and push it onto the lost state queue (via nfs4_start_recovery, so the
 * recovery thread replays the lock/unlock to the server).
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	nfs4_lost_rqst_t req;
	nfs_lock_type4 locktype;
	/* Seed the error as EINTR so recovery treats this as a lost request */
	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* Map the fcntl cmd/l_type pair onto the NFSv4 lock type. */
	locktype = flk_to_locktype(cmd, flk->l_type);
	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
	    NULL, NULL, lop, flk, &req, cr, vp);
	/*
	 * Only hand the saved request to recovery if save_lost_rqst actually
	 * recorded a LOCK/LOCKU operation; the recovery op itself is chosen
	 * from the flock type (F_UNLCK -> OP_LOCKU, otherwise OP_LOCK).
	 */
	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
	    NULL, NULL, NULL);
}
↓ open down ↓ |
9311 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX