Print this page
4827 nfs4: slow file locking
4837 NFSv4 client lock retry delay upper limit should be shorter


  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27  */
  28 
  29 /*
  30  *      Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
  31  *      All Rights Reserved
  32  */
  33 
  34 /*
  35  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  36  */
  37 




  38 #include <sys/param.h>
  39 #include <sys/types.h>
  40 #include <sys/systm.h>
  41 #include <sys/cred.h>
  42 #include <sys/time.h>
  43 #include <sys/vnode.h>
  44 #include <sys/vfs.h>
  45 #include <sys/vfs_opreg.h>
  46 #include <sys/file.h>
  47 #include <sys/filio.h>
  48 #include <sys/uio.h>
  49 #include <sys/buf.h>
  50 #include <sys/mman.h>
  51 #include <sys/pathname.h>
  52 #include <sys/dirent.h>
  53 #include <sys/debug.h>
  54 #include <sys/vmsystm.h>
  55 #include <sys/fcntl.h>
  56 #include <sys/flock.h>
  57 #include <sys/swap.h>


 273             struct pathname *, int, vnode_t *, cred_t *,
 274             caller_context_t *, int *, pathname_t *);
 275 int     nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
 276 int     nfs4_rwlock(vnode_t *, int, caller_context_t *);
 277 void    nfs4_rwunlock(vnode_t *, int, caller_context_t *);
 278 int     nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
 279 int     nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 280             caller_context_t *);
 281 int     nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 282             caller_context_t *);
 283 int     nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 284             caller_context_t *);
 285 
 286 /*
 287  * Used for nfs4_commit_vp() to indicate if we should
 288  * wait on pending writes.
 289  */
 290 #define NFS4_WRITE_NOWAIT       0
 291 #define NFS4_WRITE_WAIT         1
 292 
 293 #define NFS4_BASE_WAIT_TIME 1   /* 1 second */
 294 
 295 /*
 296  * Error flags used to pass information about certain special errors
 297  * which need to be handled specially.
 298  */
 299 #define NFS_EOF                 -98
 300 #define NFS_VERF_MISMATCH       -97
 301 
 302 /*
 303  * Flags used to differentiate between which operation drove the
 304  * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 305  */
 306 #define NFS4_CLOSE_OP           0x1
 307 #define NFS4_DELMAP_OP          0x2
 308 #define NFS4_INACTIVE_OP        0x3
 309 
 310 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
 311 
 312 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 313 #define ALIGN64(x, ptr, sz)                                             \
 314         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);           \


 345 
 346 static int nfs4_bio_do_stop = 0;
 347 
 348 static int nfs4_lostpage = 0;   /* number of times we lost original page */
 349 
 350 int nfs4_mmap_debug = 0;
 351 
 352 static int nfs4_pathconf_cache_hits = 0;
 353 static int nfs4_pathconf_cache_misses = 0;
 354 
 355 int nfs4close_all_cnt;
 356 int nfs4close_one_debug = 0;
 357 int nfs4close_notw_debug = 0;
 358 
 359 int denied_to_flk_debug = 0;
 360 void *lockt_denied_debug;
 361 
 362 #endif
 363 
 364 /*






 365  * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 366  * or NFS4ERR_RESOURCE.
 367  */
 368 static int confirm_retry_sec = 30;
 369 
 370 static int nfs4_lookup_neg_cache = 1;
 371 
 372 /*
 373  * number of pages to read ahead
 374  * optimized for 100 base-T.
 375  */
 376 static int nfs4_nra = 4;
 377 
 378 static int nfs4_do_symlink_cache = 1;
 379 
 380 static int nfs4_pathconf_disable_cache = 0;
 381 
 382 /*
 383  * These are the vnode ops routines which implement the vnode interface to
 384  * the networked file system.  These routines just take their parameters,


12956                 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12957                     "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12958                 return (ENOLCK);
12959         }
12960 
12961         flk->l_sysid = lm_sysidt(*lspp);
12962 
12963         return (0);
12964 }
12965 
12966 /*
12967  * Do the remaining preliminary setup for nfs4frlock.
12968  */
12969 static void
12970 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12971     flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12972     cred_t **cred_otw)
12973 {
12974         /*
12975          * set tick_delay to the base delay time.
12976          * (NFS4_BASE_WAIT_TIME is in secs)
12977          */
12978 
12979         *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12980 
12981         /*
12982          * If lock is relative to EOF, we need the newest length of the
12983          * file. Therefore invalidate the ATTR_CACHE.
12984          */
12985 
12986         *whencep = flk->l_whence;
12987 
12988         if (*whencep == 2)              /* SEEK_END */
12989                 PURGE_ATTRCACHE4(vp);
12990 
12991         recov_statep->rs_flags = 0;
12992         recov_statep->rs_num_retry_despite_err = 0;
12993         *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12994 }
12995 
12996 /*
12997  * Initialize and allocate the data structures necessary for
12998  * the nfs4frlock call.
12999  * Allocates argsp's op array, frees up the saved_rqstpp if there is one.


14730 
14731         /*
14732          * So, here we're going to need to retrieve the lock-owner
14733          * again (in case recovery has done a switch-a-roo) and
14734          * remove it because we can.
14735          */
14736         lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14737 
14738         if (lop) {
14739                 nfs4_rnode_remove_lock_owner(rp, lop);
14740                 lock_owner_rele(lop);
14741         }
14742 
14743         nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14744         return (0);
14745 }
14746 
14747 /*
14748  * Wait for 'tick_delay' clock ticks.
14749  * Implement exponential backoff until hit the lease_time of this nfs4_server.
14750  * NOTE: lock_lease_time is in seconds.



14751  *
14752  * XXX For future improvements, should implement a waiting queue scheme.
14753  */
14754 static int
14755 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14756 {
14757         long milliseconds_delay;
14758         time_t lock_lease_time;

14759 
14760         /* wait tick_delay clock ticks or siginteruptus */
14761         if (delay_sig(*tick_delay)) {
14762                 return (EINTR);
14763         }

14764         NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14765             "reissue the lock request: blocked for %ld clock ticks: %ld "
14766             "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14767 
14768         /* get the lease time */
14769         lock_lease_time = r2lease_time(rp);
14770 
14771         /* drv_hztousec converts ticks to microseconds */
14772         milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14773         if (milliseconds_delay < lock_lease_time * 1000) {
14774                 *tick_delay = 2 * *tick_delay;
14775                 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14776                         *tick_delay = drv_usectohz(lock_lease_time*1000*1000);



14777         }







14778         return (0);
14779 }
14780 
14781 
/*
 * Initialization hook for this file.  There is currently nothing to set up.
 */
void
nfs4_vnops_init(void)
{
	/* nothing to do */
}
14786 
/*
 * Teardown hook for this file.  There is currently nothing to release.
 */
void
nfs4_vnops_fini(void)
{
	/* nothing to do */
}
14791 
14792 /*
14793  * Return a reference to the directory (parent) vnode for a given vnode,
14794  * using the saved pathname information and the directory file handle.  The
14795  * caller is responsible for disposing of the reference.
14796  * Returns zero or an errno value.
14797  *
14798  * Caller should set need_start_op to FALSE if it is the recovery
14799  * thread, or if a start_fop has already been done.  Otherwise, TRUE.
14800  */
14801 int




  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  27  */
  28 
  29 /*
  30  *      Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
  31  *      All Rights Reserved
  32  */
  33 
  34 /*
  35  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  36  */
  37 
  38 /*
  39  * Copyright (c) 2014, STRATO AG. All rights reserved.
  40  */
  41 
  42 #include <sys/param.h>
  43 #include <sys/types.h>
  44 #include <sys/systm.h>
  45 #include <sys/cred.h>
  46 #include <sys/time.h>
  47 #include <sys/vnode.h>
  48 #include <sys/vfs.h>
  49 #include <sys/vfs_opreg.h>
  50 #include <sys/file.h>
  51 #include <sys/filio.h>
  52 #include <sys/uio.h>
  53 #include <sys/buf.h>
  54 #include <sys/mman.h>
  55 #include <sys/pathname.h>
  56 #include <sys/dirent.h>
  57 #include <sys/debug.h>
  58 #include <sys/vmsystm.h>
  59 #include <sys/fcntl.h>
  60 #include <sys/flock.h>
  61 #include <sys/swap.h>


 277             struct pathname *, int, vnode_t *, cred_t *,
 278             caller_context_t *, int *, pathname_t *);
 279 int     nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
 280 int     nfs4_rwlock(vnode_t *, int, caller_context_t *);
 281 void    nfs4_rwunlock(vnode_t *, int, caller_context_t *);
 282 int     nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
 283 int     nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 284             caller_context_t *);
 285 int     nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 286             caller_context_t *);
 287 int     nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 288             caller_context_t *);
 289 
 290 /*
 291  * Used for nfs4_commit_vp() to indicate if we should
 292  * wait on pending writes.
 293  */
 294 #define NFS4_WRITE_NOWAIT       0
 295 #define NFS4_WRITE_WAIT         1
 296 


 297 /*
 298  * Error flags used to pass information about certain special errors
 299  * which need to be handled specially.
 300  */
 301 #define NFS_EOF                 -98
 302 #define NFS_VERF_MISMATCH       -97
 303 
 304 /*
 305  * Flags used to differentiate between which operation drove the
 306  * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 307  */
 308 #define NFS4_CLOSE_OP           0x1
 309 #define NFS4_DELMAP_OP          0x2
 310 #define NFS4_INACTIVE_OP        0x3
 311 
 312 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
 313 
 314 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */
 315 #define ALIGN64(x, ptr, sz)                                             \
 316         x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);           \


 347 
 348 static int nfs4_bio_do_stop = 0;
 349 
 350 static int nfs4_lostpage = 0;   /* number of times we lost original page */
 351 
 352 int nfs4_mmap_debug = 0;
 353 
 354 static int nfs4_pathconf_cache_hits = 0;
 355 static int nfs4_pathconf_cache_misses = 0;
 356 
 357 int nfs4close_all_cnt;
 358 int nfs4close_one_debug = 0;
 359 int nfs4close_notw_debug = 0;
 360 
 361 int denied_to_flk_debug = 0;
 362 void *lockt_denied_debug;
 363 
 364 #endif
 365 
 366 /*
 367  * Base delay, in milliseconds, before a blocked lock request is retried.
 368  * Should be less than half of the lease time; preferably under one second.
 369  */
 370 int nfs4_base_wait_time = 20;
 371 
 372 /*
 373  * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 374  * or NFS4ERR_RESOURCE.
 375  */
 376 static int confirm_retry_sec = 30;
 377 
 378 static int nfs4_lookup_neg_cache = 1;
 379 
 380 /*
 381  * number of pages to read ahead
 382  * optimized for 100 base-T.
 383  */
 384 static int nfs4_nra = 4;
 385 
 386 static int nfs4_do_symlink_cache = 1;
 387 
 388 static int nfs4_pathconf_disable_cache = 0;
 389 
 390 /*
 391  * These are the vnode ops routines which implement the vnode interface to
 392  * the networked file system.  These routines just take their parameters,


12964                 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12965                     "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12966                 return (ENOLCK);
12967         }
12968 
12969         flk->l_sysid = lm_sysidt(*lspp);
12970 
12971         return (0);
12972 }
12973 
12974 /*
12975  * Do the remaining preliminary setup for nfs4frlock.
12976  */
12977 static void
12978 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12979     flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12980     cred_t **cred_otw)
12981 {
12982         /*
12983          * set tick_delay to the base delay time.
12984          * (nfs4_base_wait_time is in msecs)
12985          */
12986 
12987         *tick_delayp = drv_usectohz(nfs4_base_wait_time * 1000);
12988 
12989         /*
12990          * If lock is relative to EOF, we need the newest length of the
12991          * file. Therefore invalidate the ATTR_CACHE.
12992          */
12993 
12994         *whencep = flk->l_whence;
12995 
12996         if (*whencep == 2)              /* SEEK_END */
12997                 PURGE_ATTRCACHE4(vp);
12998 
12999         recov_statep->rs_flags = 0;
13000         recov_statep->rs_num_retry_despite_err = 0;
13001         *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
13002 }
13003 
13004 /*
13005  * Initialize and allocate the data structures necessary for
13006  * the nfs4frlock call.
13007  * Allocates argsp's op array, frees up the saved_rqstpp if there is one.


14738 
14739         /*
14740          * So, here we're going to need to retrieve the lock-owner
14741          * again (in case recovery has done a switch-a-roo) and
14742          * remove it because we can.
14743          */
14744         lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14745 
14746         if (lop) {
14747                 nfs4_rnode_remove_lock_owner(rp, lop);
14748                 lock_owner_rele(lop);
14749         }
14750 
14751         nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14752         return (0);
14753 }
14754 
14755 /*
14756  * Wait for 'tick_delay' clock ticks.
14757  * Implement exponential backoff until hit the lease_time of this nfs4_server.
14758  *
14759  * The client should retry to acquire the lock faster than the lease period.
14760  * We use roughly half of the lease time to use a similar calculation as it is
14761  * used in nfs4_renew_lease_thread().
14762  *
14763  * XXX For future improvements, should implement a waiting queue scheme.
14764  */
14765 static int
14766 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14767 {
14768         long max_msec_delay = 1 * 1000;         /* 1 sec */
14769         nfs4_server_t *sp;
14770         mntinfo4_t *mi = VTOMI4(RTOV4(rp));
14771 
14772         /* wait tick_delay clock ticks or siginteruptus */
14773         if (delay_sig(*tick_delay)) {
14774                 return (EINTR);
14775         }
14776 
14777         NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14778             "reissue the lock request: blocked for %ld clock ticks: %ld "
14779             "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14780 
14781         /*
14782          * Get the current lease time and propagation time for the server
14783          * associated with the given file. Note that both times could
14784          * change immediately after this section.
14785          */
14786         nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
14787         sp = find_nfs4_server(mi);
14788         if (sp != NULL) {
14789                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) {
14790                         max_msec_delay = sp->s_lease_time * 1000 / 2 -
14791                                         (3 * sp->propagation_delay.tv_sec *
14792                                         1000);
14793                 }
14794                 mutex_exit(&sp->s_lock);
14795                 nfs4_server_rele(sp);
14796         }
14797         nfs_rw_exit(&mi->mi_recovlock);
14798 
14799         max_msec_delay = MAX(max_msec_delay, nfs4_base_wait_time);
14800         *tick_delay = MIN(drv_usectohz(max_msec_delay * 1000), *tick_delay * 2);
14801         return (0);
14802 }
14803 

/*
 * Initialization hook for this file.  There is currently nothing to set up.
 */
void
nfs4_vnops_init(void)
{
	/* nothing to do */
}
14808 
/*
 * Teardown hook for this file.  There is currently nothing to release.
 */
void
nfs4_vnops_fini(void)
{
	/* nothing to do */
}
14813 
14814 /*
14815  * Return a reference to the directory (parent) vnode for a given vnode,
14816  * using the saved pathname information and the directory file handle.  The
14817  * caller is responsible for disposing of the reference.
14818  * Returns zero or an errno value.
14819  *
14820  * Caller should set need_start_op to FALSE if it is the recovery
14821  * thread, or if a start_fop has already been done.  Otherwise, TRUE.
14822  */
14823 int