illumos-gate New usr/src/uts/common/fs/nfs/nfs4

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  25  */
  26 
  27 /*
  28  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29  *      All Rights Reserved
  30  */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vfs_opreg.h>
  38 #include <sys/vnode.h>
  39 #include <sys/pathname.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/kmem.h>
  42 #include <sys/mkdev.h>
  43 #include <sys/mount.h>
  44 #include <sys/statvfs.h>
  45 #include <sys/errno.h>
  46 #include <sys/debug.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/utsname.h>
  49 #include <sys/bootconf.h>
  50 #include <sys/modctl.h>
  51 #include <sys/acl.h>
  52 #include <sys/flock.h>
  53 #include <sys/time.h>
  54 #include <sys/disp.h>
  55 #include <sys/policy.h>
  56 #include <sys/socket.h>
  57 #include <sys/netconfig.h>
  58 #include <sys/dnlc.h>
  59 #include <sys/list.h>
  60 #include <sys/mntent.h>
  61 #include <sys/tsol/label.h>
  62 
  63 #include <rpc/types.h>
  64 #include <rpc/auth.h>
  65 #include <rpc/rpcsec_gss.h>
  66 #include <rpc/clnt.h>
  67 
  68 #include <nfs/nfs.h>
  69 #include <nfs/nfs_clnt.h>
  70 #include <nfs/mount.h>
  71 #include <nfs/nfs_acl.h>
  72 
  73 #include <fs/fs_subr.h>
  74 
  75 #include <nfs/nfs4.h>
  76 #include <nfs/rnode4.h>
  77 #include <nfs/nfs4_clnt.h>
  78 #include <sys/fs/autofs.h>
  79 
  80 #include <sys/sdt.h>
  81 
  82 
  83 /*
  84  * Arguments passed to thread to free data structures from forced unmount.
  85  */
  86 
   87 typedef struct {
              /*
               * The three fields have the same types, in the same order, as
               * the (vfs_t *, int, cred_t *) parameter list shared by
               * async_free_mount() and nfs4_free_mount(); a pointer to this
               * structure is the sole argument of nfs4_free_mount_thread().
               * NOTE(review): presumably the fields carry those three
               * arguments across to the thread -- confirm at the call site.
               */
   88         vfs_t   *fm_vfsp;       /* vfs argument */
   89         int     fm_flag;        /* flag argument */
   90         cred_t  *fm_cr;         /* credential argument */
   91 } freemountargs_t;
  92 
  93 static void     async_free_mount(vfs_t *, int, cred_t *);
  94 static void     nfs4_free_mount(vfs_t *, int, cred_t *);
  95 static void     nfs4_free_mount_thread(freemountargs_t *);
  96 static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
  97 
  98 /*
  99  * From rpcsec module (common/rpcsec).
 100  */
 101 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
 102 extern void sec_clnt_freeinfo(struct sec_data *);
 103 
 104 /*
 105  * The order and contents of this structure must be kept in sync with that of
 106  * rfsreqcnt_v4_tmpl in nfs_stats.c
 107  */
 108 static char *rfsnames_v4[] = {
 109         "null", "compound", "reserved", "access", "close", "commit", "create",
 110         "delegpurge", "delegreturn", "getattr", "getfh", "link", "lock",
 111         "lockt", "locku", "lookup", "lookupp", "nverify", "open", "openattr",
 112         "open_confirm", "open_downgrade", "putfh", "putpubfh", "putrootfh",
 113         "read", "readdir", "readlink", "remove", "rename", "renew",
 114         "restorefh", "savefh", "secinfo", "setattr", "setclientid",
 115         "setclientid_confirm", "verify", "write"
 116 };
 117 
 118 /*
 119  * nfs4_max_mount_retry is the number of times the client will redrive
 120  * a mount compound before giving up and returning failure.  The intent
 121  * is to redrive mount compounds which fail NFS4ERR_STALE so that
 122  * if a component of the server path being mounted goes stale, it can
  123  * "recover" by redriving the mount compound (LOOKUP ops).  This recovery
 124  * code is needed outside of the recovery framework because mount is a
 125  * special case.  The client doesn't create vnodes/rnodes for components
 126  * of the server path being mounted.  The recovery code recovers real
 127  * client objects, not STALE FHs which map to components of the server
 128  * path being mounted.
 129  *
 130  * We could just fail the mount on the first time, but that would
 131  * instantly trigger failover (from nfs4_mount), and the client should
 132  * try to re-lookup the STALE FH before doing failover.  The easiest
 133  * way to "re-lookup" is to simply redrive the mount compound.
 134  */
 135 static int nfs4_max_mount_retry = 2;
 136 
 137 /*
 138  * nfs4 vfs operations.
 139  */
 140 int             nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
 141 static int      nfs4_unmount(vfs_t *, int, cred_t *);
 142 static int      nfs4_root(vfs_t *, vnode_t **);
 143 static int      nfs4_statvfs(vfs_t *, struct statvfs64 *);
 144 static int      nfs4_sync(vfs_t *, short, cred_t *);
 145 static int      nfs4_vget(vfs_t *, vnode_t **, fid_t *);
 146 static int      nfs4_mountroot(vfs_t *, whymountroot_t);
 147 static void     nfs4_freevfs(vfs_t *);
 148 
 149 static int      nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
 150                     int, cred_t *, zone_t *);
 151 
 152 vfsops_t        *nfs4_vfsops;
 153 
 154 int nfs4_vfsinit(void);
 155 void nfs4_vfsfini(void);
 156 static void nfs4setclientid_init(void);
 157 static void nfs4setclientid_fini(void);
 158 static void nfs4setclientid_otw(mntinfo4_t *, servinfo4_t *,  cred_t *,
 159                 struct nfs4_server *, nfs4_error_t *, int *);
 160 static void     destroy_nfs4_server(nfs4_server_t *);
 161 static void     remove_mi(nfs4_server_t *, mntinfo4_t *);
 162 
 163 extern void nfs4_ephemeral_init(void);
 164 extern void nfs4_ephemeral_fini(void);
 165 
 166 /* referral related routines */
 167 static servinfo4_t *copy_svp(servinfo4_t *);
 168 static void free_knconf_contents(struct knetconfig *k);
 169 static char *extract_referral_point(const char *, int);
 170 static void setup_newsvpath(servinfo4_t *, int);
 171 static void update_servinfo4(servinfo4_t *, fs_location4 *,
 172                 struct nfs_fsl_info *, char *, int);
 173 
 174 /*
 175  * Initialize the vfs structure
 176  */
 177 
 178 static int nfs4fstyp;
 179 
 180 
 181 /*
 182  * Debug variable to check for rdma based
 183  * transport startup and cleanup. Controlled
 184  * through /etc/system. Off by default.
 185  */
 186 extern int rdma_debug;
 187 
 188 int
 189 nfs4init(int fstyp, char *name)
 190 {
 191         static const fs_operation_def_t nfs4_vfsops_template[] = {
 192                 { VFSNAME_MOUNT,        { .vfs_mount = nfs4_mount } },
 193                 { VFSNAME_UNMOUNT,      { .vfs_unmount = nfs4_unmount } },
 194                 { VFSNAME_ROOT,         { .vfs_root = nfs4_root } },
 195                 { VFSNAME_STATVFS,      { .vfs_statvfs = nfs4_statvfs } },
 196                 { VFSNAME_SYNC,         { .vfs_sync = nfs4_sync } },
 197                 { VFSNAME_VGET,         { .vfs_vget = nfs4_vget } },
 198                 { VFSNAME_MOUNTROOT,    { .vfs_mountroot = nfs4_mountroot } },
 199                 { VFSNAME_FREEVFS,      { .vfs_freevfs = nfs4_freevfs } },
 200                 { NULL,                 { NULL } }
 201         };
 202         int error;
 203 
 204         nfs4_vfsops = NULL;
 205         nfs4_vnodeops = NULL;
 206         nfs4_trigger_vnodeops = NULL;
 207 
 208         error = vfs_setfsops(fstyp, nfs4_vfsops_template, &nfs4_vfsops);
 209         if (error != 0) {
 210                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 211                     "nfs4init: bad vfs ops template");
 212                 goto out;
 213         }
 214 
 215         error = vn_make_ops(name, nfs4_vnodeops_template, &nfs4_vnodeops);
 216         if (error != 0) {
 217                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 218                     "nfs4init: bad vnode ops template");
 219                 goto out;
 220         }
 221 
 222         error = vn_make_ops("nfs4_trigger", nfs4_trigger_vnodeops_template,
 223             &nfs4_trigger_vnodeops);
 224         if (error != 0) {
 225                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 226                     "nfs4init: bad trigger vnode ops template");
 227                 goto out;
 228         }
 229 
 230         nfs4fstyp = fstyp;
 231         (void) nfs4_vfsinit();
 232         (void) nfs4_init_dot_entries();
 233 
 234 out:
 235         if (error) {
 236                 if (nfs4_trigger_vnodeops != NULL)
 237                         vn_freevnodeops(nfs4_trigger_vnodeops);
 238 
 239                 if (nfs4_vnodeops != NULL)
 240                         vn_freevnodeops(nfs4_vnodeops);
 241 
 242                 (void) vfs_freevfsops_by_type(fstyp);
 243         }
 244 
 245         return (error);
 246 }
 247 
  248 void
  249 nfs4fini(void)
  250 {
              /*
               * Counterpart of the last two steps of nfs4init(), run in the
               * opposite order: nfs4init() calls nfs4_vfsinit() and then
               * nfs4_init_dot_entries(); here the dot entries are destroyed
               * first and nfs4_vfsfini() runs second.  The return value of
               * nfs4_destroy_dot_entries() is deliberately discarded.
               *
               * Nothing in this function releases the vfs/vnode operation
               * vectors that nfs4init() registered.
               */
  251         (void) nfs4_destroy_dot_entries();
  252         nfs4_vfsfini();
  253 }
 254 
 255 /*
 256  * Create a new sec_data structure to store AUTH_DH related data:
 257  * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
 258  * flag set for NFS V4 since we are avoiding to contact the rpcbind
 259  * daemon and is using the IP time service (IPPORT_TIMESERVER).
 260  *
 261  * sec_data can be freed by sec_clnt_freeinfo().
 262  */
 263 static struct sec_data *
 264 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
 265                 struct knetconfig *knconf) {
 266         struct sec_data *secdata;
 267         dh_k4_clntdata_t *data;
 268         char *pf, *p;
 269 
 270         if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
 271                 return (NULL);
 272 
 273         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 274         secdata->flags = 0;
 275 
 276         data = kmem_alloc(sizeof (*data), KM_SLEEP);
 277 
 278         data->syncaddr.maxlen = syncaddr->maxlen;
 279         data->syncaddr.len = syncaddr->len;
 280         data->syncaddr.buf = (char *)kmem_alloc(syncaddr->len, KM_SLEEP);
 281         bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
 282 
 283         /*
 284          * duplicate the knconf information for the
 285          * new opaque data.
 286          */
 287         data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
 288         *data->knconf = *knconf;
 289         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 290         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 291         bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
 292         bcopy(knconf->knc_proto, p, KNC_STRSIZE);
 293         data->knconf->knc_protofmly = pf;
 294         data->knconf->knc_proto = p;
 295 
 296         /* move server netname to the sec_data structure */
 297         data->netname = kmem_alloc(nlen, KM_SLEEP);
 298         bcopy(netname, data->netname, nlen);
 299         data->netnamelen = (int)nlen;
 300 
 301         secdata->secmod = AUTH_DH;
 302         secdata->rpcflavor = AUTH_DH;
 303         secdata->data = (caddr_t)data;
 304 
 305         return (secdata);
 306 }
 307 
 308 /*
 309  * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
 310  * is responsible for freeing.
 311  */
 312 sec_data_t *
 313 copy_sec_data(sec_data_t *fsecdata) {
 314         sec_data_t *tsecdata;
 315 
 316         if (fsecdata == NULL)
 317                 return (NULL);
 318 
 319         if (fsecdata->rpcflavor == AUTH_DH) {
 320                 dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
 321 
 322                 if (fdata == NULL)
 323                         return (NULL);
 324 
 325                 tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
 326                     fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
 327 
 328                 return (tsecdata);
 329         }
 330 
 331         tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
 332 
 333         tsecdata->secmod = fsecdata->secmod;
 334         tsecdata->rpcflavor = fsecdata->rpcflavor;
 335         tsecdata->flags = fsecdata->flags;
 336         tsecdata->uid = fsecdata->uid;
 337 
 338         if (fsecdata->rpcflavor == RPCSEC_GSS) {
 339                 gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
 340 
 341                 tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
 342         } else {
 343                 tsecdata->data = NULL;
 344         }
 345 
 346         return (tsecdata);
 347 }
 348 
 349 gss_clntdata_t *
 350 copy_sec_data_gss(gss_clntdata_t *fdata)
 351 {
 352         gss_clntdata_t *tdata;
 353 
 354         if (fdata == NULL)
 355                 return (NULL);
 356 
 357         tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
 358 
 359         tdata->mechanism.length = fdata->mechanism.length;
 360         tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
 361             KM_SLEEP);
 362         bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
 363             fdata->mechanism.length);
 364 
 365         tdata->service = fdata->service;
 366 
 367         (void) strcpy(tdata->uname, fdata->uname);
 368         (void) strcpy(tdata->inst, fdata->inst);
 369         (void) strcpy(tdata->realm, fdata->realm);
 370 
 371         tdata->qop = fdata->qop;
 372 
 373         return (tdata);
 374 }
 375 
 376 static int
 377 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
 378 {
 379         servinfo4_t *si;
 380 
 381         /*
 382          * Iterate over the servinfo4 list to make sure
 383          * we do not have a duplicate. Skip any servinfo4
 384          * that has been marked "NOT IN USE"
 385          */
 386         for (si = svp_head; si; si = si->sv_next) {
 387                 (void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
 388                 if (si->sv_flags & SV4_NOTINUSE) {
 389                         nfs_rw_exit(&si->sv_lock);
 390                         continue;
 391                 }
 392                 nfs_rw_exit(&si->sv_lock);
 393                 if (si == svp)
 394                         continue;
 395                 if (si->sv_addr.len == svp->sv_addr.len &&
 396                     strcmp(si->sv_knconf->knc_protofmly,
 397                     svp->sv_knconf->knc_protofmly) == 0 &&
 398                     bcmp(si->sv_addr.buf, svp->sv_addr.buf,
 399                     si->sv_addr.len) == 0) {
 400                         /* it's a duplicate */
 401                         return (1);
 402                 }
 403         }
 404         /* it's not a duplicate */
 405         return (0);
 406 }
 407 
 408 void
 409 nfs4_free_args(struct nfs_args *nargs)
 410 {
 411         if (nargs->knconf) {
 412                 if (nargs->knconf->knc_protofmly)
 413                         kmem_free(nargs->knconf->knc_protofmly,
 414                             KNC_STRSIZE);
 415                 if (nargs->knconf->knc_proto)
 416                         kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
 417                 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
 418                 nargs->knconf = NULL;
 419         }
 420 
 421         if (nargs->fh) {
 422                 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
 423                 nargs->fh = NULL;
 424         }
 425 
 426         if (nargs->hostname) {
 427                 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
 428                 nargs->hostname = NULL;
 429         }
 430 
 431         if (nargs->addr) {
 432                 if (nargs->addr->buf) {
 433                         ASSERT(nargs->addr->len);
 434                         kmem_free(nargs->addr->buf, nargs->addr->len);
 435                 }
 436                 kmem_free(nargs->addr, sizeof (struct netbuf));
 437                 nargs->addr = NULL;
 438         }
 439 
 440         if (nargs->syncaddr) {
 441                 ASSERT(nargs->syncaddr->len);
 442                 if (nargs->syncaddr->buf) {
 443                         ASSERT(nargs->syncaddr->len);
 444                         kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
 445                 }
 446                 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
 447                 nargs->syncaddr = NULL;
 448         }
 449 
 450         if (nargs->netname) {
 451                 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
 452                 nargs->netname = NULL;
 453         }
 454 
 455         if (nargs->nfs_ext_u.nfs_extA.secdata) {
 456                 sec_clnt_freeinfo(
 457                     nargs->nfs_ext_u.nfs_extA.secdata);
 458                 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
 459         }
 460 }
 461 
 462 
 463 int
 464 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
 465 {
 466 
 467         int error;
 468         size_t hlen;                    /* length of hostname */
 469         size_t nlen;                    /* length of netname */
 470         char netname[MAXNETNAMELEN+1];  /* server's netname */
 471         struct netbuf addr;             /* server's address */
 472         struct netbuf syncaddr;         /* AUTH_DES time sync addr */
 473         struct knetconfig *knconf;              /* transport structure */
 474         struct sec_data *secdata = NULL;        /* security data */
 475         STRUCT_DECL(nfs_args, args);            /* nfs mount arguments */
 476         STRUCT_DECL(knetconfig, knconf_tmp);
 477         STRUCT_DECL(netbuf, addr_tmp);
 478         int flags;
 479         char *p, *pf;
 480         struct pathname pn;
 481         char *userbufptr;
 482 
 483 
 484         bzero(nargs, sizeof (*nargs));
 485 
 486         STRUCT_INIT(args, get_udatamodel());
 487         bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
 488         if (copyin(data, STRUCT_BUF(args), MIN(datalen,
 489             STRUCT_SIZE(args))))
 490                 return (EFAULT);
 491 
 492         nargs->wsize = STRUCT_FGET(args, wsize);
 493         nargs->rsize = STRUCT_FGET(args, rsize);
 494         nargs->timeo = STRUCT_FGET(args, timeo);
 495         nargs->retrans = STRUCT_FGET(args, retrans);
 496         nargs->acregmin = STRUCT_FGET(args, acregmin);
 497         nargs->acregmax = STRUCT_FGET(args, acregmax);
 498         nargs->acdirmin = STRUCT_FGET(args, acdirmin);
 499         nargs->acdirmax = STRUCT_FGET(args, acdirmax);
 500 
 501         flags = STRUCT_FGET(args, flags);
 502         nargs->flags = flags;
 503 
 504         addr.buf = NULL;
 505         syncaddr.buf = NULL;
 506 
 507 
 508         /*
 509          * Allocate space for a knetconfig structure and
 510          * its strings and copy in from user-land.
 511          */
 512         knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
 513         STRUCT_INIT(knconf_tmp, get_udatamodel());
 514         if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
 515             STRUCT_SIZE(knconf_tmp))) {
 516                 kmem_free(knconf, sizeof (*knconf));
 517                 return (EFAULT);
 518         }
 519 
 520         knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
 521         knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
 522         knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
 523         if (get_udatamodel() != DATAMODEL_LP64) {
 524                 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
 525         } else {
 526                 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
 527         }
 528 
 529         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 530         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 531         error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
 532         if (error) {
 533                 kmem_free(pf, KNC_STRSIZE);
 534                 kmem_free(p, KNC_STRSIZE);
 535                 kmem_free(knconf, sizeof (*knconf));
 536                 return (error);
 537         }
 538 
 539         error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
 540         if (error) {
 541                 kmem_free(pf, KNC_STRSIZE);
 542                 kmem_free(p, KNC_STRSIZE);
 543                 kmem_free(knconf, sizeof (*knconf));
 544                 return (error);
 545         }
 546 
 547 
 548         knconf->knc_protofmly = pf;
 549         knconf->knc_proto = p;
 550 
 551         nargs->knconf = knconf;
 552 
 553         /*
 554          * Get server address
 555          */
 556         STRUCT_INIT(addr_tmp, get_udatamodel());
 557         if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
 558             STRUCT_SIZE(addr_tmp))) {
 559                 error = EFAULT;
 560                 goto errout;
 561         }
 562 
 563         nargs->addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
 564         userbufptr = STRUCT_FGETP(addr_tmp, buf);
 565         addr.len = STRUCT_FGET(addr_tmp, len);
 566         addr.buf = kmem_alloc(addr.len, KM_SLEEP);
 567         addr.maxlen = addr.len;
 568         if (copyin(userbufptr, addr.buf, addr.len)) {
 569                 kmem_free(addr.buf, addr.len);
 570                 error = EFAULT;
 571                 goto errout;
 572         }
 573         bcopy(&addr, nargs->addr, sizeof (struct netbuf));
 574 
 575         /*
 576          * Get the root fhandle
 577          */
 578         error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
 579         if (error)
 580                 goto errout;
 581 
 582         /* Volatile fh: keep server paths, so use actual-size strings */
 583         nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
 584         bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
 585         nargs->fh[pn.pn_pathlen] = '\0';
 586         pn_free(&pn);
 587 
 588 
 589         /*
 590          * Get server's hostname
 591          */
 592         if (flags & NFSMNT_HOSTNAME) {
 593                 error = copyinstr(STRUCT_FGETP(args, hostname),
 594                     netname, sizeof (netname), &hlen);
 595                 if (error)
 596                         goto errout;
 597                 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
 598                 (void) strcpy(nargs->hostname, netname);
 599 
 600         } else {
 601                 nargs->hostname = NULL;
 602         }
 603 
 604 
 605         /*
 606          * If there are syncaddr and netname data, load them in. This is
 607          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 608          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 609          */
 610         netname[0] = '\0';
 611         if (flags & NFSMNT_SECURE) {
 612 
 613                 /* get syncaddr */
 614                 STRUCT_INIT(addr_tmp, get_udatamodel());
 615                 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
 616                     STRUCT_SIZE(addr_tmp))) {
 617                         error = EINVAL;
 618                         goto errout;
 619                 }
 620                 userbufptr = STRUCT_FGETP(addr_tmp, buf);
 621                 syncaddr.len = STRUCT_FGET(addr_tmp, len);
 622                 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
 623                 syncaddr.maxlen = syncaddr.len;
 624                 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
 625                         kmem_free(syncaddr.buf, syncaddr.len);
 626                         error = EFAULT;
 627                         goto errout;
 628                 }
 629 
 630                 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 631                 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
 632 
 633                 /* get server's netname */
 634                 if (copyinstr(STRUCT_FGETP(args, netname), netname,
 635                     sizeof (netname), &nlen)) {
 636                         error = EFAULT;
 637                         goto errout;
 638                 }
 639 
 640                 netname[nlen] = '\0';
 641                 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
 642                 (void) strcpy(nargs->netname, netname);
 643         }
 644 
 645         /*
 646          * Get the extention data which has the security data structure.
 647          * This includes data for AUTH_SYS as well.
 648          */
 649         if (flags & NFSMNT_NEWARGS) {
 650                 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
 651                 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
 652                     nargs->nfs_args_ext == NFS_ARGS_EXTB) {
 653                         /*
 654                          * Indicating the application is using the new
 655                          * sec_data structure to pass in the security
 656                          * data.
 657                          */
 658                         if (STRUCT_FGETP(args,
 659                             nfs_ext_u.nfs_extA.secdata) != NULL) {
 660                                 error = sec_clnt_loadinfo(
 661                                     (struct sec_data *)STRUCT_FGETP(args,
 662                                     nfs_ext_u.nfs_extA.secdata),
 663                                     &secdata, get_udatamodel());
 664                         }
 665                         nargs->nfs_ext_u.nfs_extA.secdata = secdata;
 666                 }
 667         }
 668 
 669         if (error)
 670                 goto errout;
 671 
 672         /*
 673          * Failover support:
 674          *
 675          * We may have a linked list of nfs_args structures,
 676          * which means the user is looking for failover.  If
 677          * the mount is either not "read-only" or "soft",
 678          * we want to bail out with EINVAL.
 679          */
 680         if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
 681                 nargs->nfs_ext_u.nfs_extB.next =
 682                     STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
 683 
 684 errout:
 685         if (error)
 686                 nfs4_free_args(nargs);
 687 
 688         return (error);
 689 }
 690 
 691 
 692 /*
 693  * nfs mount vfsop
 694  * Set up mount info record and attach it to vfs struct.
 695  */
 696 int
 697 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 698 {
 699         char *data = uap->dataptr;
 700         int error;
 701         vnode_t *rtvp;                  /* the server's root */
 702         mntinfo4_t *mi;                 /* mount info, pointed at by vfs */
 703         struct knetconfig *rdma_knconf; /* rdma transport structure */
 704         rnode4_t *rp;
 705         struct servinfo4 *svp;          /* nfs server info */
 706         struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
 707         struct servinfo4 *svp_head;     /* first nfs server info */
 708         struct servinfo4 *svp_2ndlast;  /* 2nd last in server info list */
 709         struct sec_data *secdata;       /* security data */
 710         struct nfs_args *args = NULL;
 711         int flags, addr_type, removed;
 712         zone_t *zone = nfs_zone();
 713         nfs4_error_t n4e;
 714         zone_t *mntzone = NULL;
 715 
 716         if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
 717                 return (EPERM);
 718         if (mvp->v_type != VDIR)
 719                 return (ENOTDIR);
 720 
 721         /*
 722          * get arguments
 723          *
 724          * nfs_args is now versioned and is extensible, so
 725          * uap->datalen might be different from sizeof (args)
 726          * in a compatible situation.
 727          */
 728 more:
 729         if (!(uap->flags & MS_SYSSPACE)) {
 730                 if (args == NULL)
 731                         args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
 732                 else
 733                         nfs4_free_args(args);
 734                 error = nfs4_copyin(data, uap->datalen, args);
 735                 if (error) {
 736                         if (args) {
 737                                 kmem_free(args, sizeof (*args));
 738                         }
 739                         return (error);
 740                 }
 741         } else {
 742                 args = (struct nfs_args *)data;
 743         }
 744 
 745         flags = args->flags;
 746 
 747         /*
 748          * If the request changes the locking type, disallow the remount,
 749          * because it's questionable whether we can transfer the
 750          * locking state correctly.
 751          */
 752         if (uap->flags & MS_REMOUNT) {
 753                 if (!(uap->flags & MS_SYSSPACE)) {
 754                         nfs4_free_args(args);
 755                         kmem_free(args, sizeof (*args));
 756                 }
 757                 if ((mi = VFTOMI4(vfsp)) != NULL) {
 758                         uint_t new_mi_llock;
 759                         uint_t old_mi_llock;
 760                         new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
 761                         old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
 762                         if (old_mi_llock != new_mi_llock)
 763                                 return (EBUSY);
 764                 }
 765                 return (0);
 766         }
 767 
 768         /*
 769          * For ephemeral mount trigger stub vnodes, we have two problems
 770          * to solve: racing threads will likely fail the v_count check, and
 771          * we want only one to proceed with the mount.
 772          *
 773          * For stubs, if the mount has already occurred (via a racing thread),
 774          * just return success. If not, skip the v_count check and proceed.
 775          * Note that we are already serialised at this point.
 776          */
 777         mutex_enter(&mvp->v_lock);
 778         if (vn_matchops(mvp, nfs4_trigger_vnodeops)) {
 779                 /* mntpt is a v4 stub vnode */
 780                 ASSERT(RP_ISSTUB(VTOR4(mvp)));
 781                 ASSERT(!(uap->flags & MS_OVERLAY));
 782                 ASSERT(!(mvp->v_flag & VROOT));
 783                 if (vn_mountedvfs(mvp) != NULL) {
 784                         /* ephemeral mount has already occurred */
 785                         ASSERT(uap->flags & MS_SYSSPACE);
 786                         mutex_exit(&mvp->v_lock);
 787                         return (0);
 788                 }
 789         } else {
 790                 /* mntpt is a non-v4 or v4 non-stub vnode */
 791                 if (!(uap->flags & MS_OVERLAY) &&
 792                     (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 793                         mutex_exit(&mvp->v_lock);
 794                         if (!(uap->flags & MS_SYSSPACE)) {
 795                                 nfs4_free_args(args);
 796                                 kmem_free(args, sizeof (*args));
 797                         }
 798                         return (EBUSY);
 799                 }
 800         }
 801         mutex_exit(&mvp->v_lock);
 802 
 803         /* make sure things are zeroed for errout: */
 804         rtvp = NULL;
 805         mi = NULL;
 806         secdata = NULL;
 807 
 808         /*
 809          * A valid knetconfig structure is required.
 810          */
 811         if (!(flags & NFSMNT_KNCONF) ||
 812             args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
 813             args->knconf->knc_proto == NULL ||
 814             (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
 815                 if (!(uap->flags & MS_SYSSPACE)) {
 816                         nfs4_free_args(args);
 817                         kmem_free(args, sizeof (*args));
 818                 }
 819                 return (EINVAL);
 820         }
 821 
 822         if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
 823             (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
 824                 if (!(uap->flags & MS_SYSSPACE)) {
 825                         nfs4_free_args(args);
 826                         kmem_free(args, sizeof (*args));
 827                 }
 828                 return (EINVAL);
 829         }
 830 
 831         /*
 832          * Allocate a servinfo4 struct.
 833          */
 834         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
 835         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
 836         if (svp_tail) {
 837                 svp_2ndlast = svp_tail;
 838                 svp_tail->sv_next = svp;
 839         } else {
 840                 svp_head = svp;
 841                 svp_2ndlast = svp;
 842         }
 843 
 844         svp_tail = svp;
 845         svp->sv_knconf = args->knconf;
 846         args->knconf = NULL;
 847 
 848         /*
 849          * Get server address
 850          */
 851         if (args->addr == NULL || args->addr->buf == NULL) {
 852                 error = EINVAL;
 853                 goto errout;
 854         }
 855 
 856         svp->sv_addr.maxlen = args->addr->maxlen;
 857         svp->sv_addr.len = args->addr->len;
 858         svp->sv_addr.buf = args->addr->buf;
 859         args->addr->buf = NULL;
 860 
 861         /*
 862          * Get the root fhandle
 863          */
 864         if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
 865                 error = EINVAL;
 866                 goto errout;
 867         }
 868 
 869         svp->sv_path = args->fh;
 870         svp->sv_pathlen = strlen(args->fh) + 1;
 871         args->fh = NULL;
 872 
 873         /*
 874          * Get server's hostname
 875          */
 876         if (flags & NFSMNT_HOSTNAME) {
 877                 if (args->hostname == NULL || (strlen(args->hostname) >
 878                     MAXNETNAMELEN)) {
 879                         error = EINVAL;
 880                         goto errout;
 881                 }
 882                 svp->sv_hostnamelen = strlen(args->hostname) + 1;
 883                 svp->sv_hostname = args->hostname;
 884                 args->hostname = NULL;
 885         } else {
 886                 char *p = "unknown-host";
 887                 svp->sv_hostnamelen = strlen(p) + 1;
 888                 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
 889                 (void) strcpy(svp->sv_hostname, p);
 890         }
 891 
 892         /*
 893          * RDMA MOUNT SUPPORT FOR NFS v4.
 894          * Establish, is it possible to use RDMA, if so overload the
 895          * knconf with rdma specific knconf and free the orignal knconf.
 896          */
 897         if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
 898                 /*
 899                  * Determine the addr type for RDMA, IPv4 or v6.
 900                  */
 901                 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
 902                         addr_type = AF_INET;
 903                 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
 904                         addr_type = AF_INET6;
 905 
 906                 if (rdma_reachable(addr_type, &svp->sv_addr,
 907                     &rdma_knconf) == 0) {
 908                         /*
 909                          * If successful, hijack the orignal knconf and
 910                          * replace with the new one, depending on the flags.
 911                          */
 912                         svp->sv_origknconf = svp->sv_knconf;
 913                         svp->sv_knconf = rdma_knconf;
 914                 } else {
 915                         if (flags & NFSMNT_TRYRDMA) {
 916 #ifdef  DEBUG
 917                                 if (rdma_debug)
 918                                         zcmn_err(getzoneid(), CE_WARN,
 919                                             "no RDMA onboard, revert\n");
 920 #endif
 921                         }
 922 
 923                         if (flags & NFSMNT_DORDMA) {
 924                                 /*
 925                                  * If proto=rdma is specified and no RDMA
 926                                  * path to this server is avialable then
 927                                  * ditch this server.
 928                                  * This is not included in the mountable
 929                                  * server list or the replica list.
 930                                  * Check if more servers are specified;
 931                                  * Failover case, otherwise bail out of mount.
 932                                  */
 933                                 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 934                                     args->nfs_ext_u.nfs_extB.next != NULL) {
 935                                         data = (char *)
 936                                             args->nfs_ext_u.nfs_extB.next;
 937                                         if (uap->flags & MS_RDONLY &&
 938                                             !(flags & NFSMNT_SOFT)) {
 939                                                 if (svp_head->sv_next == NULL) {
 940                                                         svp_tail = NULL;
 941                                                         svp_2ndlast = NULL;
 942                                                         sv4_free(svp_head);
 943                                                         goto more;
 944                                                 } else {
 945                                                         svp_tail = svp_2ndlast;
 946                                                         svp_2ndlast->sv_next =
 947                                                             NULL;
 948                                                         sv4_free(svp);
 949                                                         goto more;
 950                                                 }
 951                                         }
 952                                 } else {
 953                                         /*
 954                                          * This is the last server specified
 955                                          * in the nfs_args list passed down
 956                                          * and its not rdma capable.
 957                                          */
 958                                         if (svp_head->sv_next == NULL) {
 959                                                 /*
 960                                                  * Is this the only one
 961                                                  */
 962                                                 error = EINVAL;
 963 #ifdef  DEBUG
 964                                                 if (rdma_debug)
 965                                                         zcmn_err(getzoneid(),
 966                                                             CE_WARN,
 967                                                             "No RDMA srv");
 968 #endif
 969                                                 goto errout;
 970                                         } else {
 971                                                 /*
 972                                                  * There is list, since some
 973                                                  * servers specified before
 974                                                  * this passed all requirements
 975                                                  */
 976                                                 svp_tail = svp_2ndlast;
 977                                                 svp_2ndlast->sv_next = NULL;
 978                                                 sv4_free(svp);
 979                                                 goto proceed;
 980                                         }
 981                                 }
 982                         }
 983                 }
 984         }
 985 
 986         /*
 987          * If there are syncaddr and netname data, load them in. This is
 988          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 989          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 990          */
 991         if (args->flags & NFSMNT_SECURE) {
 992                 svp->sv_dhsec = create_authdh_data(args->netname,
 993                     strlen(args->netname),
 994                     args->syncaddr, svp->sv_knconf);
 995         }
 996 
 997         /*
 998          * Get the extention data which has the security data structure.
 999          * This includes data for AUTH_SYS as well.
1000          */
1001         if (flags & NFSMNT_NEWARGS) {
1002                 switch (args->nfs_args_ext) {
1003                 case NFS_ARGS_EXTA:
1004                 case NFS_ARGS_EXTB:
1005                         /*
1006                          * Indicating the application is using the new
1007                          * sec_data structure to pass in the security
1008                          * data.
1009                          */
1010                         secdata = args->nfs_ext_u.nfs_extA.secdata;
1011                         if (secdata == NULL) {
1012                                 error = EINVAL;
1013                         } else if (uap->flags & MS_SYSSPACE) {
1014                                 /*
1015                                  * Need to validate the flavor here if
1016                                  * sysspace, userspace was already
1017                                  * validate from the nfs_copyin function.
1018                                  */
1019                                 switch (secdata->rpcflavor) {
1020                                 case AUTH_NONE:
1021                                 case AUTH_UNIX:
1022                                 case AUTH_LOOPBACK:
1023                                 case AUTH_DES:
1024                                 case RPCSEC_GSS:
1025                                         break;
1026                                 default:
1027                                         error = EINVAL;
1028                                         goto errout;
1029                                 }
1030                         }
1031                         args->nfs_ext_u.nfs_extA.secdata = NULL;
1032                         break;
1033 
1034                 default:
1035                         error = EINVAL;
1036                         break;
1037                 }
1038 
1039         } else if (flags & NFSMNT_SECURE) {
1040                 /*
1041                  * NFSMNT_SECURE is deprecated but we keep it
1042                  * to support the rogue user-generated application
1043                  * that may use this undocumented interface to do
1044                  * AUTH_DH security, e.g. our own rexd.
1045                  *
1046                  * Also note that NFSMNT_SECURE is used for passing
1047                  * AUTH_DH info to be used in negotiation.
1048                  */
1049                 secdata = create_authdh_data(args->netname,
1050                     strlen(args->netname), args->syncaddr, svp->sv_knconf);
1051 
1052         } else {
1053                 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1054                 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
1055                 secdata->data = NULL;
1056         }
1057 
1058         svp->sv_secdata = secdata;
1059 
1060         /*
1061          * User does not explictly specify a flavor, and a user
1062          * defined default flavor is passed down.
1063          */
1064         if (flags & NFSMNT_SECDEFAULT) {
1065                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1066                 svp->sv_flags |= SV4_TRYSECDEFAULT;
1067                 nfs_rw_exit(&svp->sv_lock);
1068         }
1069 
1070         /*
1071          * Failover support:
1072          *
1073          * We may have a linked list of nfs_args structures,
1074          * which means the user is looking for failover.  If
1075          * the mount is either not "read-only" or "soft",
1076          * we want to bail out with EINVAL.
1077          */
1078         if (args->nfs_args_ext == NFS_ARGS_EXTB &&
1079             args->nfs_ext_u.nfs_extB.next != NULL) {
1080                 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
1081                         data = (char *)args->nfs_ext_u.nfs_extB.next;
1082                         goto more;
1083                 }
1084                 error = EINVAL;
1085                 goto errout;
1086         }
1087 
1088         /*
1089          * Determine the zone we're being mounted into.
1090          */
1091         zone_hold(mntzone = zone);              /* start with this assumption */
1092         if (getzoneid() == GLOBAL_ZONEID) {
1093                 zone_rele(mntzone);
1094                 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
1095                 ASSERT(mntzone != NULL);
1096                 if (mntzone != zone) {
1097                         error = EBUSY;
1098                         goto errout;
1099                 }
1100         }
1101 
1102         if (is_system_labeled()) {
1103                 error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
1104                     svp->sv_knconf, cr);
1105 
1106                 if (error > 0)
1107                         goto errout;
1108 
1109                 if (error == -1) {
1110                         /* change mount to read-only to prevent write-down */
1111                         vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1112                 }
1113         }
1114 
1115         /*
1116          * Stop the mount from going any further if the zone is going away.
1117          */
1118         if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
1119                 error = EBUSY;
1120                 goto errout;
1121         }
1122 
1123         /*
1124          * Get root vnode.
1125          */
1126 proceed:
1127         error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
1128         if (error) {
1129                 /* if nfs4rootvp failed, it will free svp_head */
1130                 svp_head = NULL;
1131                 goto errout;
1132         }
1133 
1134         mi = VTOMI4(rtvp);
1135 
1136         /*
1137          * Send client id to the server, if necessary
1138          */
1139         nfs4_error_zinit(&n4e);
1140         nfs4setclientid(mi, cr, FALSE, &n4e);
1141 
1142         error = n4e.error;
1143 
1144         if (error)
1145                 goto errout;
1146 
1147         /*
1148          * Set option fields in the mount info record
1149          */
1150 
1151         if (svp_head->sv_next) {
1152                 mutex_enter(&mi->mi_lock);
1153                 mi->mi_flags |= MI4_LLOCK;
1154                 mutex_exit(&mi->mi_lock);
1155         }
1156         error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
1157         if (error)
1158                 goto errout;
1159 
1160         /*
1161          * Time to tie in the mirror mount info at last!
1162          */
1163         if (flags & NFSMNT_EPHEMERAL)
1164                 error = nfs4_record_ephemeral_mount(mi, mvp);
1165 
1166 errout:
1167         if (error) {
1168                 if (rtvp != NULL) {
1169                         rp = VTOR4(rtvp);
1170                         if (rp->r_flags & R4HASHED)
1171                                 rp4_rmhash(rp);
1172                 }
1173                 if (mi != NULL) {
1174                         nfs4_async_stop(vfsp);
1175                         nfs4_async_manager_stop(vfsp);
1176                         nfs4_remove_mi_from_server(mi, NULL);
1177                         if (rtvp != NULL)
1178                                 VN_RELE(rtvp);
1179                         if (mntzone != NULL)
1180                                 zone_rele(mntzone);
1181                         /* need to remove it from the zone */
1182                         removed = nfs4_mi_zonelist_remove(mi);
1183                         if (removed)
1184                                 zone_rele_ref(&mi->mi_zone_ref,
1185                                     ZONE_REF_NFSV4);
1186                         MI4_RELE(mi);
1187                         if (!(uap->flags & MS_SYSSPACE) && args) {
1188                                 nfs4_free_args(args);
1189                                 kmem_free(args, sizeof (*args));
1190                         }
1191                         return (error);
1192                 }
1193                 if (svp_head)
1194                         sv4_free(svp_head);
1195         }
1196 
1197         if (!(uap->flags & MS_SYSSPACE) && args) {
1198                 nfs4_free_args(args);
1199                 kmem_free(args, sizeof (*args));
1200         }
1201         if (rtvp != NULL)
1202                 VN_RELE(rtvp);
1203 
1204         if (mntzone != NULL)
1205                 zone_rele(mntzone);
1206 
1207         return (error);
1208 }
1209 
/*
 * Common prefix for the diagnostics below.  Kernels built with DEBUG
 * spell out the protocol version; other builds use the generic wording.
 */
#ifndef	DEBUG
#define	VERS_MSG	"NFS server "
#else
#define	VERS_MSG	"NFS4 server "
#endif

/*
 * printf-style formats reporting that a server returned zero for a
 * transfer or file size.  Each takes a single %s argument (presumably
 * the server's host name -- confirm against the callers).
 */
#define	READ_MSG	VERS_MSG "%s returned 0 for read transfer size"
#define	WRITE_MSG	VERS_MSG "%s returned 0 for write transfer size"
#define	SIZE_MSG	VERS_MSG "%s returned 0 for maximum file size"
1222 
1223 /*
1224  * Get the symbolic link text from the server for a given filehandle
1225  * of that symlink.
1226  *
1227  *      (get symlink text) PUTFH READLINK
1228  */
1229 static int
1230 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
1231     int flags)
1232 {
1233         COMPOUND4args_clnt args;
1234         COMPOUND4res_clnt res;
1235         int doqueue;
1236         nfs_argop4 argop[2];
1237         nfs_resop4 *resop;
1238         READLINK4res *lr_res;
1239         uint_t len;
1240         bool_t needrecov = FALSE;
1241         nfs4_recov_state_t recov_state;
1242         nfs4_sharedfh_t *sfh;
1243         nfs4_error_t e;
1244         int num_retry = nfs4_max_mount_retry;
1245         int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1246 
1247         sfh = sfh4_get(fh, mi);
1248         recov_state.rs_flags = 0;
1249         recov_state.rs_num_retry_despite_err = 0;
1250 
1251 recov_retry:
1252         nfs4_error_zinit(&e);
1253 
1254         args.array_len = 2;
1255         args.array = argop;
1256         args.ctag = TAG_GET_SYMLINK;
1257 
1258         if (! recovery) {
1259                 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
1260                 if (e.error) {
1261                         sfh4_rele(&sfh);
1262                         return (e.error);
1263                 }
1264         }
1265 
1266         /* 0. putfh symlink fh */
1267         argop[0].argop = OP_CPUTFH;
1268         argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1269 
1270         /* 1. readlink */
1271         argop[1].argop = OP_READLINK;
1272 
1273         doqueue = 1;
1274 
1275         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1276 
1277         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1278 
1279         if (needrecov && !recovery && num_retry-- > 0) {
1280 
1281                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1282                     "getlinktext_otw: initiating recovery\n"));
1283 
1284                 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
1285                     OP_READLINK, NULL, NULL, NULL) == FALSE) {
1286                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1287                         if (!e.error)
1288                                 (void) xdr_free(xdr_COMPOUND4res_clnt,
1289                                     (caddr_t)&res);
1290                         goto recov_retry;
1291                 }
1292         }
1293 
1294         /*
1295          * If non-NFS4 pcol error and/or we weren't able to recover.
1296          */
1297         if (e.error != 0) {
1298                 if (! recovery)
1299                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1300                 sfh4_rele(&sfh);
1301                 return (e.error);
1302         }
1303 
1304         if (res.status) {
1305                 e.error = geterrno4(res.status);
1306                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1307                 if (! recovery)
1308                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1309                 sfh4_rele(&sfh);
1310                 return (e.error);
1311         }
1312 
1313         /* res.status == NFS4_OK */
1314         ASSERT(res.status == NFS4_OK);
1315 
1316         resop = &res.array[1];  /* readlink res */
1317         lr_res = &resop->nfs_resop4_u.opreadlink;
1318 
1319         /* treat symlink name as data */
1320         *linktextp = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
1321 
1322         if (! recovery)
1323                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1324         sfh4_rele(&sfh);
1325         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1326         return (0);
1327 }
1328 
1329 /*
1330  * Skip over consecutive slashes and "/./" in a pathname.
1331  */
1332 void
1333 pathname_skipslashdot(struct pathname *pnp)
1334 {
1335         char *c1, *c2;
1336 
1337         while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
1338 
1339                 c1 = pnp->pn_path + 1;
1340                 c2 = pnp->pn_path + 2;
1341 
1342                 if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
1343                         pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
1344                         pnp->pn_pathlen = pnp->pn_pathlen - 2;
1345                 } else {
1346                         pnp->pn_path++;
1347                         pnp->pn_pathlen--;
1348                 }
1349         }
1350 }
1351 
1352 /*
1353  * Resolve a symbolic link path. The symlink is in the nth component of
1354  * svp->sv_path and has an nfs4 file handle "fh".
1355  * Upon return, the sv_path will point to the new path that has the nth
1356  * component resolved to its symlink text.
1357  */
1358 int
1359 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
1360     cred_t *cr, int flags)
1361 {
1362         char *oldpath;
1363         char *symlink, *newpath;
1364         struct pathname oldpn, newpn;
1365         char component[MAXNAMELEN];
1366         int i, addlen, error = 0;
1367         int oldpathlen;
1368 
1369         /* Get the symbolic link text over the wire. */
1370         error = getlinktext_otw(mi, fh, &symlink, cr, flags);
1371 
1372         if (error || symlink == NULL || strlen(symlink) == 0)
1373                 return (error);
1374 
1375         /*
1376          * Compose the new pathname.
1377          * Note:
1378          *    - only the nth component is resolved for the pathname.
1379          *    - pathname.pn_pathlen does not count the ending null byte.
1380          */
1381         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1382         oldpath = svp->sv_path;
1383         oldpathlen = svp->sv_pathlen;
1384         if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
1385                 nfs_rw_exit(&svp->sv_lock);
1386                 kmem_free(symlink, strlen(symlink) + 1);
1387                 return (error);
1388         }
1389         nfs_rw_exit(&svp->sv_lock);
1390         pn_alloc(&newpn);
1391 
1392         /*
1393          * Skip over previous components from the oldpath so that the
1394          * oldpn.pn_path will point to the symlink component. Skip
1395          * leading slashes and "/./" (no OP_LOOKUP on ".") so that
1396          * pn_getcompnent can get the component.
1397          */
1398         for (i = 1; i < nth; i++) {
1399                 pathname_skipslashdot(&oldpn);
1400                 error = pn_getcomponent(&oldpn, component);
1401                 if (error)
1402                         goto out;
1403         }
1404 
1405         /*
1406          * Copy the old path upto the component right before the symlink
1407          * if the symlink is not an absolute path.
1408          */
1409         if (symlink[0] != '/') {
1410                 addlen = oldpn.pn_path - oldpn.pn_buf;
1411                 bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
1412                 newpn.pn_pathlen += addlen;
1413                 newpn.pn_path += addlen;
1414                 newpn.pn_buf[newpn.pn_pathlen] = '/';
1415                 newpn.pn_pathlen++;
1416                 newpn.pn_path++;
1417         }
1418 
1419         /* copy the resolved symbolic link text */
1420         addlen = strlen(symlink);
1421         if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1422                 error = ENAMETOOLONG;
1423                 goto out;
1424         }
1425         bcopy(symlink, newpn.pn_path, addlen);
1426         newpn.pn_pathlen += addlen;
1427         newpn.pn_path += addlen;
1428 
1429         /*
1430          * Check if there is any remaining path after the symlink component.
1431          * First, skip the symlink component.
1432          */
1433         pathname_skipslashdot(&oldpn);
1434         if (error = pn_getcomponent(&oldpn, component))
1435                 goto out;
1436 
1437         addlen = pn_pathleft(&oldpn); /* includes counting the slash */
1438 
1439         /*
1440          * Copy the remaining path to the new pathname if there is any.
1441          */
1442         if (addlen > 0) {
1443                 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1444                         error = ENAMETOOLONG;
1445                         goto out;
1446                 }
1447                 bcopy(oldpn.pn_path, newpn.pn_path, addlen);
1448                 newpn.pn_pathlen += addlen;
1449         }
1450         newpn.pn_buf[newpn.pn_pathlen] = '\0';
1451 
1452         /* get the newpath and store it in the servinfo4_t */
1453         newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
1454         bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
1455         newpath[newpn.pn_pathlen] = '\0';
1456 
1457         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1458         svp->sv_path = newpath;
1459         svp->sv_pathlen = strlen(newpath) + 1;
1460         nfs_rw_exit(&svp->sv_lock);
1461 
1462         kmem_free(oldpath, oldpathlen);
1463 out:
1464         kmem_free(symlink, strlen(symlink) + 1);
1465         pn_free(&newpn);
1466         pn_free(&oldpn);
1467 
1468         return (error);
1469 }
1470 
1471 /*
1472  * This routine updates servinfo4 structure with the new referred server
1473  * info.
1474  * nfsfsloc has the location related information
1475  * fsp has the hostname and pathname info.
1476  * new path = pathname from referral + part of orig pathname(based on nth).
1477  */
1478 static void
1479 update_servinfo4(servinfo4_t *svp, fs_location4 *fsp,
1480     struct nfs_fsl_info *nfsfsloc, char *orig_path, int nth)
1481 {
1482         struct knetconfig *knconf, *svknconf;
1483         struct netbuf *saddr;
1484         sec_data_t      *secdata;
1485         utf8string *host;
1486         int i = 0, num_slashes = 0;
1487         char *p, *spath, *op, *new_path;
1488 
1489         /* Update knconf */
1490         knconf = svp->sv_knconf;
1491         free_knconf_contents(knconf);
1492         bzero(knconf, sizeof (struct knetconfig));
1493         svknconf = nfsfsloc->knconf;
1494         knconf->knc_semantics = svknconf->knc_semantics;
1495         knconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1496         knconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1497         knconf->knc_rdev = svknconf->knc_rdev;
1498         bcopy(svknconf->knc_protofmly, knconf->knc_protofmly, KNC_STRSIZE);
1499         bcopy(svknconf->knc_proto, knconf->knc_proto, KNC_STRSIZE);
1500 
1501         /* Update server address */
1502         saddr = &svp->sv_addr;
1503         if (saddr->buf != NULL)
1504                 kmem_free(saddr->buf, saddr->maxlen);
1505         saddr->buf  = kmem_alloc(nfsfsloc->addr->maxlen, KM_SLEEP);
1506         saddr->len = nfsfsloc->addr->len;
1507         saddr->maxlen = nfsfsloc->addr->maxlen;
1508         bcopy(nfsfsloc->addr->buf, saddr->buf, nfsfsloc->addr->len);
1509 
1510         /* Update server name */
1511         host = fsp->server_val;
1512         kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
1513         svp->sv_hostname = kmem_zalloc(host->utf8string_len + 1, KM_SLEEP);
1514         bcopy(host->utf8string_val, svp->sv_hostname, host->utf8string_len);
1515         svp->sv_hostname[host->utf8string_len] = '\0';
1516         svp->sv_hostnamelen = host->utf8string_len + 1;
1517 
1518         /*
1519          * Update server path.
1520          * We need to setup proper path here.
1521          * For ex., If we got a path name serv1:/rp/aaa/bbb
1522          * where aaa is a referral and points to serv2:/rpool/aa
1523          * we need to set the path to serv2:/rpool/aa/bbb
1524          * The first part of this below code generates /rpool/aa
1525          * and the second part appends /bbb to the server path.
1526          */
1527         spath = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1528         *p++ = '/';
1529         for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1530                 component4 *comp;
1531 
1532                 comp = &fsp->rootpath.pathname4_val[i];
1533                 /* If no space, null the string and bail */
1534                 if ((p - spath) + comp->utf8string_len + 1 > MAXPATHLEN) {
1535                         p = spath + MAXPATHLEN - 1;
1536                         spath[0] = '\0';
1537                         break;
1538                 }
1539                 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1540                 p += comp->utf8string_len;
1541                 *p++ = '/';
1542         }
1543         if (fsp->rootpath.pathname4_len != 0)
1544                 *(p - 1) = '\0';
1545         else
1546                 *p = '\0';
1547         p = spath;
1548 
1549         new_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1550         (void) strlcpy(new_path, p, MAXPATHLEN);
1551         kmem_free(p, MAXPATHLEN);
1552         i = strlen(new_path);
1553 
1554         for (op = orig_path; *op; op++) {
1555                 if (*op == '/')
1556                         num_slashes++;
1557                 if (num_slashes == nth + 2) {
1558                         while (*op != '\0') {
1559                                 new_path[i] = *op;
1560                                 i++;
1561                                 op++;
1562                         }
1563                         break;
1564                 }
1565         }
1566         new_path[i] = '\0';
1567 
1568         kmem_free(svp->sv_path, svp->sv_pathlen);
1569         svp->sv_pathlen = strlen(new_path) + 1;
1570         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
1571         bcopy(new_path, svp->sv_path, svp->sv_pathlen);
1572         kmem_free(new_path, MAXPATHLEN);
1573 
1574         /*
1575          * All the security data is specific to old server.
1576          * Clean it up except secdata which deals with mount options.
1577          * We need to inherit that data. Copy secdata into our new servinfo4.
1578          */
1579         if (svp->sv_dhsec) {
1580                 sec_clnt_freeinfo(svp->sv_dhsec);
1581                 svp->sv_dhsec = NULL;
1582         }
1583         if (svp->sv_save_secinfo &&
1584             svp->sv_save_secinfo != svp->sv_secinfo) {
1585                 secinfo_free(svp->sv_save_secinfo);
1586                 svp->sv_save_secinfo = NULL;
1587         }
1588         if (svp->sv_secinfo) {
1589                 secinfo_free(svp->sv_secinfo);
1590                 svp->sv_secinfo = NULL;
1591         }
1592         svp->sv_currsec = NULL;
1593 
1594         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1595         *secdata = *svp->sv_secdata;
1596         secdata->data = NULL;
1597         if (svp->sv_secdata) {
1598                 sec_clnt_freeinfo(svp->sv_secdata);
1599                 svp->sv_secdata = NULL;
1600         }
1601         svp->sv_secdata = secdata;
1602 }
1603 
1604 /*
1605  * Resolve a referral. The referral is in the n+1th component of
1606  * svp->sv_path and has a parent nfs4 file handle "fh".
1607  * Upon return, the sv_path will point to the new path that has referral
1608  * component resolved to its referred path and part of original path.
1609  * Hostname and other address information is also updated.
1610  */
1611 int
1612 resolve_referral(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, int nth,
1613     nfs_fh4 *fh)
1614 {
1615         nfs4_sharedfh_t *sfh;
1616         struct nfs_fsl_info nfsfsloc;
1617         nfs4_ga_res_t garp;
1618         COMPOUND4res_clnt callres;
1619         fs_location4    *fsp;
1620         char *nm, *orig_path;
1621         int orig_pathlen = 0, ret = -1, index;
1622 
1623         if (svp->sv_pathlen <= 0)
1624                 return (ret);
1625 
1626         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1627         orig_pathlen = svp->sv_pathlen;
1628         orig_path = kmem_alloc(orig_pathlen, KM_SLEEP);
1629         bcopy(svp->sv_path, orig_path, orig_pathlen);
1630         nm = extract_referral_point(svp->sv_path, nth);
1631         setup_newsvpath(svp, nth);
1632         nfs_rw_exit(&svp->sv_lock);
1633 
1634         sfh = sfh4_get(fh, mi);
1635         index = nfs4_process_referral(mi, sfh, nm, cr,
1636             &garp, &callres, &nfsfsloc);
1637         sfh4_rele(&sfh);
1638         kmem_free(nm, MAXPATHLEN);
1639         if (index < 0) {
1640                 kmem_free(orig_path, orig_pathlen);
1641                 return (index);
1642         }
1643 
1644         fsp =  &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1645         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1646         update_servinfo4(svp, fsp, &nfsfsloc, orig_path, nth);
1647         nfs_rw_exit(&svp->sv_lock);
1648 
1649         mutex_enter(&mi->mi_lock);
1650         mi->mi_vfs_referral_loop_cnt++;
1651         mutex_exit(&mi->mi_lock);
1652 
1653         ret = 0;
1654 bad:
1655         /* Free up XDR memory allocated in nfs4_process_referral() */
1656         xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1657         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1658         kmem_free(orig_path, orig_pathlen);
1659 
1660         return (ret);
1661 }
1662 
1663 /*
1664  * Get the root filehandle for the given filesystem and server, and update
1665  * svp.
1666  *
1667  * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
1668  * to coordinate with recovery.  Otherwise, the caller is assumed to be
1669  * the recovery thread or have already done a start_fop.
1670  *
1671  * Errors are returned by the nfs4_error_t parameter.
      *
      * Parameters:
      *   mi     mount info; on success mi_fh_expire_type, the MI4_LINK,
      *          MI4_SYMLINK and MI4_ACL flags, mi_tsize, mi_stsize and
      *          mi_maxfilesize are updated from the server's reply.
      *   svp    server info; sv_path is the path that gets looked up.  On
      *          success sv_fhandle, sv_pfhandle, sv_fsid and sv_supp_attrs
      *          are filled in.  sv_path itself may be rewritten along the
      *          way by resolve_sympath() or resolve_referral().
      *   vtp    out: vnode type of the object at the end of the path.
      *   flags  NFS4_GETFH_NEEDSOP (see above) and/or NFS4_GETFH_PUBLIC
      *          (start from the public filehandle instead of the root).
      *   cr     credentials used for the over-the-wire calls.
      *   ep     out: error/status of the operation.
      *
      * An empty sv_path, or too many referrals already followed, is
      * reported as EINVAL.
1672  */
1673 static void
1674 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
1675     int flags, cred_t *cr, nfs4_error_t *ep)
1676 {
1677         COMPOUND4args_clnt args;
1678         COMPOUND4res_clnt res;
1679         int doqueue = 1;
1680         nfs_argop4 *argop;
1681         nfs_resop4 *resop;
1682         nfs4_ga_res_t *garp;
1683         int num_argops;
1684         lookup4_param_t lookuparg;
1685         nfs_fh4 *tmpfhp;
1686         nfs_fh4 *resfhp;
1687         bool_t needrecov = FALSE;
1688         nfs4_recov_state_t recov_state;
1689         int llndx;
1690         int nthcomp;
             /* Without NEEDSOP the caller already coordinates with recovery. */
1691         int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1692
             /* An empty path cannot be looked up; reject it up front. */
1693         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1694         ASSERT(svp->sv_path != NULL);
1695         if (svp->sv_path[0] == '\0') {
1696                 nfs_rw_exit(&svp->sv_lock);
1697                 nfs4_error_init(ep, EINVAL);
1698                 return;
1699         }
1700         nfs_rw_exit(&svp->sv_lock);
1701
1702         recov_state.rs_flags = 0;
1703         recov_state.rs_num_retry_despite_err = 0;
1704
             /*
              * Re-entered after every symlink or referral resolution and after
              * every recovery attempt that was not aborted.  By the time we get
              * back here the previous compound's arguments and results have
              * been freed.  Give up once too many referrals have been followed.
              */
1705 recov_retry:
1706         if (mi->mi_vfs_referral_loop_cnt >= NFS4_REFERRAL_LOOP_MAX) {
1707                 DTRACE_PROBE3(nfs4clnt__debug__referral__loop, mntinfo4 *,
1708                     mi, servinfo4_t *, svp, char *, "nfs4getfh_otw");
1709                 nfs4_error_init(ep, EINVAL);
1710                 return;
1711         }
1712         nfs4_error_zinit(ep);
1713
1714         if (!recovery) {
1715                 ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
1716                     &recov_state, NULL);
1717
1718                 /*
1719                  * If recovery has been started and this request was
1720                  * initiated by a mount, then we must wait for recovery
1721                  * to finish before proceeding, otherwise, the error
1722                  * cleanup would remove data structures needed by the
1723                  * recovery thread.
1724                  */
1725                 if (ep->error) {
1726                         mutex_enter(&mi->mi_lock);
1727                         if (mi->mi_flags & MI4_MOUNTING) {
1728                                 mi->mi_flags |= MI4_RECOV_FAIL;
1729                                 mi->mi_error = EIO;
1730
1731                                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1732                                     "nfs4getfh_otw: waiting 4 recovery\n"));
1733
1734                                 while (mi->mi_flags & MI4_RECOV_ACTIV)
1735                                         cv_wait(&mi->mi_failover_cv,
1736                                             &mi->mi_lock);
1737                         }
1738                         mutex_exit(&mi->mi_lock);
1739                         return;
1740                 }
1741
1742                 /*
1743                  * If the client does not specify a specific flavor to use
1744                  * and has not gotten a secinfo list from the server yet,
1745                  * retrieve the secinfo list from the server and use a
1746                  * flavor from the list to mount.
1747                  *
1748                  * If fail to get the secinfo list from the server, then
1749                  * try the default flavor.
1750                  */
1751                 if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
1752                     svp->sv_secinfo == NULL) {
1753                         (void) nfs4_secinfo_path(mi, cr, FALSE);
1754                 }
1755         }
1756
1757         if (recovery)
1758                 args.ctag = TAG_REMAP_MOUNT;
1759         else
1760                 args.ctag = TAG_MOUNT;
1761
1762         lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1763         lookuparg.argsp = &args;
1764         lookuparg.resp = &res;
1765         lookuparg.header_len = 2;       /* Putrootfh, getfh */
1766         lookuparg.trailer_len = 0;
1767         lookuparg.ga_bits = FATTR4_FSINFO_MASK;
1768         lookuparg.mi = mi;
1769
             /*
              * Build the compound for sv_path.  The returned llndx is used
              * further down as the index of the last LOOKUP op, with -1
              * meaning that the path needed no LOOKUP at all.
              */
1770         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1771         ASSERT(svp->sv_path != NULL);
1772         llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
1773         nfs_rw_exit(&svp->sv_lock);
1774
1775         argop = args.array;
1776         num_argops = args.array_len;
1777
1778         /* choose public or root filehandle */
1779         if (flags & NFS4_GETFH_PUBLIC)
1780                 argop[0].argop = OP_PUTPUBFH;
1781         else
1782                 argop[0].argop = OP_PUTROOTFH;
1783
1784         /* get fh */
1785         argop[1].argop = OP_GETFH;
1786
1787         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1788             "nfs4getfh_otw: %s call, mi 0x%p",
1789             needrecov ? "recov" : "first", (void *)mi));
1790
1791         rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1792
1793         needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
1794
1795         if (needrecov) {
1796                 bool_t abort;
1797
                     /*
                      * No start_fop was done on this path, so recovery is not
                      * started from here: release the compound and hand the
                      * error in *ep back to the caller.  The results are only
                      * freed when the call itself did not fail.
                      */
1798                 if (recovery) {
1799                         nfs4args_lookup_free(argop, num_argops);
1800                         kmem_free(argop,
1801                             lookuparg.arglen * sizeof (nfs_argop4));
1802                         if (!ep->error)
1803                                 (void) xdr_free(xdr_COMPOUND4res_clnt,
1804                                     (caddr_t)&res);
1805                         return;
1806                 }
1807
1808                 NFS4_DEBUG(nfs4_client_recov_debug,
1809                     (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
1810
1811                 abort = nfs4_start_recovery(ep, mi, NULL,
1812                     NULL, NULL, NULL, OP_GETFH, NULL, NULL, NULL);
1813                 if (!ep->error) {
1814                         ep->error = geterrno4(res.status);
1815                         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1816                 }
1817                 nfs4args_lookup_free(argop, num_argops);
1818                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1819                 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1820                 /* have another go? */
1821                 if (abort == FALSE)
1822                         goto recov_retry;
1823                 return;
1824         }
1825
1826         /*
1827          * No recovery, but check if error is set.
1828          */
1829         if (ep->error)  {
1830                 nfs4args_lookup_free(argop, num_argops);
1831                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1832                 if (!recovery)
1833                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1834                             needrecov);
1835                 return;
1836         }
1837
             /* NOTE: no goto in this function targets the label below. */
1838 is_link_err:
1839
1840         /* for non-recovery errors */
1841         if (res.status && res.status != NFS4ERR_SYMLINK &&
1842             res.status != NFS4ERR_MOVED) {
1843                 if (!recovery) {
1844                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1845                             needrecov);
1846                 }
1847                 nfs4args_lookup_free(argop, num_argops);
1848                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1849                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1850                 return;
1851         }
1852
1853         /*
1854          * If any intermediate component in the path is a symbolic link,
1855          * resolve the symlink, then try mount again using the new path.
              * A component that has moved (NFS4ERR_MOVED) is handled the same
              * way, except that it is resolved as a referral.
1856          */
1857         if (res.status == NFS4ERR_SYMLINK || res.status == NFS4ERR_MOVED) {
1858                 int where;
1859
1860                 /*
1861                  * Need to call nfs4_end_fop before resolve_sympath to avoid
1862                  * potential nfs4_start_fop deadlock.
1863                  */
1864                 if (!recovery)
1865                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1866                             needrecov);
1867
1868                 /*
1869                  * This must be from OP_LOOKUP failure. The (cfh) for this
1870                  * OP_LOOKUP is a symlink node. Find out where the
1871                  * OP_GETFH is for the (cfh) that is a symlink node.
1872                  *
1873                  * Example:
1874                  * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
1875                  * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
1876                  *
1877                  * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
1878                  * In this case, where = 7, nthcomp = 2.
1879                  */
1880                 where = res.array_len - 2;
1881                 ASSERT(where > 0);
1882
1883                 if (res.status == NFS4ERR_SYMLINK) {
1884
1885                         resop = &res.array[where - 1];
1886                         ASSERT(resop->resop == OP_GETFH);
1887                         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1888                         nthcomp = res.array_len/3 - 1;
1889                         ep->error = resolve_sympath(mi, svp, nthcomp,
1890                             tmpfhp, cr, flags);
1891
1892                 } else if (res.status == NFS4ERR_MOVED) {
1893
                             /*
                              * The GETFH used here sits one op earlier than
                              * in the symlink case; resolve_referral()
                              * documents its fh argument as the referral's
                              * parent filehandle.
                              * NOTE(review): presumably the op that fails
                              * with MOVED is the GETFH following the LOOKUP
                              * of the referral - confirm against the server
                              * behaviour.
                              */
1894                         resop = &res.array[where - 2];
1895                         ASSERT(resop->resop == OP_GETFH);
1896                         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1897                         nthcomp = res.array_len/3 - 1;
1898                         ep->error = resolve_referral(mi, svp, cr, nthcomp,
1899                             tmpfhp);
1900                 }
1901
                     /* tmpfhp points into res, so free only after resolving. */
1902                 nfs4args_lookup_free(argop, num_argops);
1903                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1904                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1905
1906                 if (ep->error)
1907                         return;
1908
1909                 goto recov_retry;
1910         }
1911
             /*
              * The compound succeeded.  Its last two results are the GETFH
              * and GETATTR for the final object.
              */
1912         /* getfh */
1913         resop = &res.array[res.array_len - 2];
1914         ASSERT(resop->resop == OP_GETFH);
1915         resfhp = &resop->nfs_resop4_u.opgetfh.object;
1916
1917         /* getattr fsinfo res */
1918         resop++;
1919         garp = &resop->nfs_resop4_u.opgetattr.ga_res;
1920
1921         *vtp = garp->n4g_va.va_type;
1922
1923         mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
1924
1925         mutex_enter(&mi->mi_lock);
1926         if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
1927                 mi->mi_flags |= MI4_LINK;
1928         if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
1929                 mi->mi_flags |= MI4_SYMLINK;
1930         if (garp->n4g_ext_res->n4g_suppattrs & FATTR4_ACL_MASK)
1931                 mi->mi_flags |= MI4_ACL;
1932         mutex_exit(&mi->mi_lock);
1933
             /*
              * Never raise the transfer sizes, only clamp them to what the
              * server reports; a reported maximum of 0 is replaced by
              * MAXBSIZE.
              */
1934         if (garp->n4g_ext_res->n4g_maxread == 0)
1935                 mi->mi_tsize =
1936                     MIN(MAXBSIZE, mi->mi_tsize);
1937         else
1938                 mi->mi_tsize =
1939                     MIN(garp->n4g_ext_res->n4g_maxread,
1940                     mi->mi_tsize);
1941
1942         if (garp->n4g_ext_res->n4g_maxwrite == 0)
1943                 mi->mi_stsize =
1944                     MIN(MAXBSIZE, mi->mi_stsize);
1945         else
1946                 mi->mi_stsize =
1947                     MIN(garp->n4g_ext_res->n4g_maxwrite,
1948                     mi->mi_stsize);
1949
             /* A reported maximum file size of 0 leaves the current limit. */
1950         if (garp->n4g_ext_res->n4g_maxfilesize != 0)
1951                 mi->mi_maxfilesize =
1952                     MIN(garp->n4g_ext_res->n4g_maxfilesize,
1953                     mi->mi_maxfilesize);
1954
1955         /*
1956          * If the final component is a symbolic link, resolve the symlink,
1957          * then try mount again using the new path.
1958          *
1959          * Assume no symbolic link for root filesystem "/".
1960          */
1961         if (*vtp == VLNK) {
1962                 /*
1963                  * nthcomp is the total result length minus
1964                  * the 1st 2 OPs (PUTROOTFH, GETFH),
1965                  * then divided by 3 (LOOKUP,GETFH,GETATTR)
1966                  *
1967                  * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
1968                  *      LOOKUP 2nd-comp GETFH GETATTR
1969                  *
1970                  *      (8 - 2)/3 = 2
1971                  */
1972                 nthcomp = (res.array_len - 2)/3;
1973
1974                 /*
1975                  * Need to call nfs4_end_fop before resolve_sympath to avoid
1976                  * potential nfs4_start_fop deadlock. See RFE 4777612.
1977                  */
1978                 if (!recovery)
1979                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1980                             needrecov);
1981
1982                 ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
1983                     flags);
1984
                     /* resfhp points into res, so free only after resolving. */
1985                 nfs4args_lookup_free(argop, num_argops);
1986                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1987                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1988
1989                 if (ep->error)
1990                         return;
1991
1992                 goto recov_retry;
1993         }
1994
1995         /*
1996          * We need to figure out where in the compound the getfh
1997          * for the parent directory is. If the object to be mounted is
1998          * the root, then there is no lookup at all:
1999          * PUTROOTFH, GETFH.
2000          * If the object to be mounted is in the root, then the compound is:
2001          * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
2002          * In either of these cases, the index of the GETFH is 1.
2003          * If it is not at the root, then it's something like:
2004          * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
2005          * LOOKUP, GETFH, GETATTR
2006          * In this case, the index is llndx (last lookup index) - 2.
2007          */
2008         if (llndx == -1 || llndx == 2)
2009                 resop = &res.array[1];
2010         else {
2011                 ASSERT(llndx > 2);
2012                 resop = &res.array[llndx-2];
2013         }
2014
2015         ASSERT(resop->resop == OP_GETFH);
2016         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
2017
2018         /* save the filehandles for the replica */
             /* sv_pfhandle is the parent directory, sv_fhandle the object. */
2019         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2020         ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
2021         svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
2022         bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
2023             tmpfhp->nfs_fh4_len);
2024         ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
2025         svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
2026         bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
2027
2028         /* initialize fsid and supp_attrs for server fs */
2029         svp->sv_fsid = garp->n4g_fsid;
2030         svp->sv_supp_attrs =
2031             garp->n4g_ext_res->n4g_suppattrs | FATTR4_MANDATTR_MASK;
2032
2033         nfs_rw_exit(&svp->sv_lock);
2034         nfs4args_lookup_free(argop, num_argops);
2035         kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
2036         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2037         if (!recovery)
2038                 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
2039 }
2040 
2041 /*
2042  * Save a copy of Servinfo4_t structure.
2043  * We might need when there is a failure in getting file handle
2044  * in case of a referral to replace servinfo4 struct and try again.
2045  */
2046 static struct servinfo4 *
2047 copy_svp(servinfo4_t *nsvp)
2048 {
2049         servinfo4_t *svp = NULL;
2050         struct knetconfig *sknconf, *tknconf;
2051         struct netbuf *saddr, *taddr;
2052 
2053         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2054         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2055         svp->sv_flags = nsvp->sv_flags;
2056         svp->sv_fsid = nsvp->sv_fsid;
2057         svp->sv_hostnamelen = nsvp->sv_hostnamelen;
2058         svp->sv_pathlen = nsvp->sv_pathlen;
2059         svp->sv_supp_attrs = nsvp->sv_supp_attrs;
2060 
2061         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2062         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2063         bcopy(nsvp->sv_hostname, svp->sv_hostname, svp->sv_hostnamelen);
2064         bcopy(nsvp->sv_path, svp->sv_path, svp->sv_pathlen);
2065 
2066         saddr = &nsvp->sv_addr;
2067         taddr = &svp->sv_addr;
2068         taddr->maxlen = saddr->maxlen;
2069         taddr->len = saddr->len;
2070         if (saddr->len > 0) {
2071                 taddr->buf = kmem_zalloc(saddr->maxlen, KM_SLEEP);
2072                 bcopy(saddr->buf, taddr->buf, saddr->len);
2073         }
2074 
2075         svp->sv_knconf = kmem_zalloc(sizeof (struct knetconfig), KM_SLEEP);
2076         sknconf = nsvp->sv_knconf;
2077         tknconf = svp->sv_knconf;
2078         tknconf->knc_semantics = sknconf->knc_semantics;
2079         tknconf->knc_rdev = sknconf->knc_rdev;
2080         if (sknconf->knc_proto != NULL) {
2081                 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2082                 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2083                     KNC_STRSIZE);
2084         }
2085         if (sknconf->knc_protofmly != NULL) {
2086                 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2087                 bcopy(sknconf->knc_protofmly, (char *)tknconf->knc_protofmly,
2088                     KNC_STRSIZE);
2089         }
2090 
2091         if (nsvp->sv_origknconf != NULL) {
2092                 svp->sv_origknconf = kmem_zalloc(sizeof (struct knetconfig),
2093                     KM_SLEEP);
2094                 sknconf = nsvp->sv_origknconf;
2095                 tknconf = svp->sv_origknconf;
2096                 tknconf->knc_semantics = sknconf->knc_semantics;
2097                 tknconf->knc_rdev = sknconf->knc_rdev;
2098                 if (sknconf->knc_proto != NULL) {
2099                         tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2100                         bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2101                             KNC_STRSIZE);
2102                 }
2103                 if (sknconf->knc_protofmly != NULL) {
2104                         tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE,
2105                             KM_SLEEP);
2106                         bcopy(sknconf->knc_protofmly,
2107                             (char *)tknconf->knc_protofmly, KNC_STRSIZE);
2108                 }
2109         }
2110 
2111         svp->sv_secdata = copy_sec_data(nsvp->sv_secdata);
2112         svp->sv_dhsec = copy_sec_data(svp->sv_dhsec);
2113         /*
2114          * Rest of the security information is not copied as they are built
2115          * with the information available from secdata and dhsec.
2116          */
2117         svp->sv_next = NULL;
2118 
2119         return (svp);
2120 }
2121 
2122 servinfo4_t *
2123 restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)
2124 {
2125         servinfo4_t *srvnext, *tmpsrv;
2126 
2127         if (strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) {
2128                 /*
2129                  * Since the hostname changed, we must be dealing
2130                  * with a referral, and the lookup failed.  We will
2131                  * restore the whole servinfo4_t to what it was before.
2132                  */
2133                 srvnext = svp->sv_next;
2134                 svp->sv_next = NULL;
2135                 tmpsrv = copy_svp(origsvp);
2136                 sv4_free(svp);
2137                 svp = tmpsrv;
2138                 svp->sv_next = srvnext;
2139                 mutex_enter(&mi->mi_lock);
2140                 mi->mi_servers = svp;
2141                 mi->mi_curr_serv = svp;
2142                 mutex_exit(&mi->mi_lock);
2143 
2144         } else if (origsvp->sv_pathlen != svp->sv_pathlen) {
2145 
2146                 /*
2147                  * For symlink case: restore original path because
2148                  * it might have contained symlinks that were
2149                  * expanded by nfsgetfh_otw before the failure occurred.
2150                  */
2151                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2152                 kmem_free(svp->sv_path, svp->sv_pathlen);
2153                 svp->sv_path =
2154                     kmem_alloc(origsvp->sv_pathlen, KM_SLEEP);
2155                 svp->sv_pathlen = origsvp->sv_pathlen;
2156                 bcopy(origsvp->sv_path, svp->sv_path,
2157                     origsvp->sv_pathlen);
2158                 nfs_rw_exit(&svp->sv_lock);
2159         }
2160         return (svp);
2161 }
2162 
2163 static ushort_t nfs4_max_threads = 8;   /* max number of active async threads; initial mi_max_threads */
2164 uint_t nfs4_bsize = 32 * 1024;  /* client `block' size; becomes vfs_bsize of each mount */
2165 static uint_t nfs4_async_clusters = 1;  /* # of reqs from each async queue */
2166 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO; /* mi_timeo for NC_TPI_COTS and NC_TPI_COTS_ORD transports */
2167 
2168 /*
2169  * Remap the root filehandle for the given filesystem.
2170  *
2171  * results returned via the nfs4_error_t parameter.
2172  */
2173 void
2174 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
2175 {
2176         struct servinfo4 *svp, *origsvp;
2177         vtype_t vtype;
2178         nfs_fh4 rootfh;
2179         int getfh_flags;
2180         int num_retry;
2181 
2182         mutex_enter(&mi->mi_lock);
2183 
2184 remap_retry:
2185         svp = mi->mi_curr_serv;
2186         getfh_flags =
2187             (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
2188         getfh_flags |=
2189             (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
2190         mutex_exit(&mi->mi_lock);
2191 
2192         /*
2193          * Just in case server path being mounted contains
2194          * symlinks and fails w/STALE, save the initial sv_path
2195          * so we can redrive the initial mount compound with the
2196          * initial sv_path -- not a symlink-expanded version.
2197          *
2198          * This could only happen if a symlink was expanded
2199          * and the expanded mount compound failed stale.  Because
2200          * it could be the case that the symlink was removed at
2201          * the server (and replaced with another symlink/dir,
2202          * we need to use the initial sv_path when attempting
2203          * to re-lookup everything and recover.
2204          */
2205         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2206         origsvp = copy_svp(svp);
2207         nfs_rw_exit(&svp->sv_lock);
2208 
2209         num_retry = nfs4_max_mount_retry;
2210 
2211         do {
2212                 /*
2213                  * Get the root fh from the server.  Retry nfs4_max_mount_retry
2214                  * (2) times if it fails with STALE since the recovery
2215                  * infrastructure doesn't do STALE recovery for components
2216                  * of the server path to the object being mounted.
2217                  */
2218                 nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
2219 
2220                 if (ep->error == 0 && ep->stat == NFS4_OK)
2221                         break;
2222 
2223                 /*
2224                  * For some reason, the mount compound failed.  Before
2225                  * retrying, we need to restore original conditions.
2226                  */
2227                 svp = restore_svp(mi, svp, origsvp);
2228 
2229         } while (num_retry-- > 0);
2230 
2231         sv4_free(origsvp);
2232 
2233         if (ep->error != 0 || ep->stat != 0) {
2234                 return;
2235         }
2236 
2237         if (vtype != VNON && vtype != mi->mi_type) {
2238                 /* shouldn't happen */
2239                 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2240                     "nfs4_remap_root: server root vnode type (%d) doesn't "
2241                     "match mount info (%d)", vtype, mi->mi_type);
2242         }
2243 
2244         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2245         rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2246         rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2247         nfs_rw_exit(&svp->sv_lock);
2248         sfh4_update(mi->mi_rootfh, &rootfh);
2249 
2250         /*
2251          * It's possible that recovery took place on the filesystem
2252          * and the server has been updated between the time we did
2253          * the nfs4getfh_otw and now. Re-drive the otw operation
2254          * to make sure we have a good fh.
2255          */
2256         mutex_enter(&mi->mi_lock);
2257         if (mi->mi_curr_serv != svp)
2258                 goto remap_retry;
2259 
2260         mutex_exit(&mi->mi_lock);
2261 }
2262 
2263 static int
2264 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
2265     int flags, cred_t *cr, zone_t *zone)
2266 {
2267         vnode_t *rtvp = NULL;
2268         mntinfo4_t *mi;
2269         dev_t nfs_dev;
2270         int error = 0;
2271         rnode4_t *rp;
2272         int i, len;
2273         struct vattr va;
2274         vtype_t vtype = VNON;
2275         vtype_t tmp_vtype = VNON;
2276         struct servinfo4 *firstsvp = NULL, *svp = svp_head;
2277         nfs4_oo_hash_bucket_t *bucketp;
2278         nfs_fh4 fh;
2279         char *droptext = "";
2280         struct nfs_stats *nfsstatsp;
2281         nfs4_fname_t *mfname;
2282         nfs4_error_t e;
2283         int num_retry, removed;
2284         cred_t *lcr = NULL, *tcr = cr;
2285         struct servinfo4 *origsvp;
2286         char *resource;
2287 
2288         nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
2289         ASSERT(nfsstatsp != NULL);
2290 
2291         ASSERT(nfs_zone() == zone);
2292         ASSERT(crgetref(cr));
2293 
2294         /*
2295          * Create a mount record and link it to the vfs struct.
2296          */
2297         mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
2298         mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
2299         nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
2300         nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
2301         nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
2302 
2303         if (!(flags & NFSMNT_SOFT))
2304                 mi->mi_flags |= MI4_HARD;
2305         if ((flags & NFSMNT_NOPRINT))
2306                 mi->mi_flags |= MI4_NOPRINT;
2307         if (flags & NFSMNT_INT)
2308                 mi->mi_flags |= MI4_INT;
2309         if (flags & NFSMNT_PUBLIC)
2310                 mi->mi_flags |= MI4_PUBLIC;
2311         if (flags & NFSMNT_MIRRORMOUNT)
2312                 mi->mi_flags |= MI4_MIRRORMOUNT;
2313         if (flags & NFSMNT_REFERRAL)
2314                 mi->mi_flags |= MI4_REFERRAL;
2315         mi->mi_retrans = NFS_RETRIES;
2316         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
2317             svp->sv_knconf->knc_semantics == NC_TPI_COTS)
2318                 mi->mi_timeo = nfs4_cots_timeo;
2319         else
2320                 mi->mi_timeo = NFS_TIMEO;
2321         mi->mi_prog = NFS_PROGRAM;
2322         mi->mi_vers = NFS_V4;
2323         mi->mi_rfsnames = rfsnames_v4;
2324         mi->mi_reqs = nfsstatsp->nfs_stats_v4.rfsreqcnt_ptr;
2325         cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
2326         mi->mi_servers = svp;
2327         mi->mi_curr_serv = svp;
2328         mi->mi_acregmin = SEC2HR(ACREGMIN);
2329         mi->mi_acregmax = SEC2HR(ACREGMAX);
2330         mi->mi_acdirmin = SEC2HR(ACDIRMIN);
2331         mi->mi_acdirmax = SEC2HR(ACDIRMAX);
2332         mi->mi_fh_expire_type = FH4_PERSISTENT;
2333         mi->mi_clientid_next = NULL;
2334         mi->mi_clientid_prev = NULL;
2335         mi->mi_srv = NULL;
2336         mi->mi_grace_wait = 0;
2337         mi->mi_error = 0;
2338         mi->mi_srvsettime = 0;
2339         mi->mi_srvset_cnt = 0;
2340 
2341         mi->mi_count = 1;
2342 
2343         mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
2344         mi->mi_stsize = mi->mi_tsize;
2345 
2346         if (flags & NFSMNT_DIRECTIO)
2347                 mi->mi_flags |= MI4_DIRECTIO;
2348 
2349         mi->mi_flags |= MI4_MOUNTING;
2350 
2351         /*
2352          * Make a vfs struct for nfs.  We do this here instead of below
2353          * because rtvp needs a vfs before we can do a getattr on it.
2354          *
2355          * Assign a unique device id to the mount
2356          */
2357         mutex_enter(&nfs_minor_lock);
2358         do {
2359                 nfs_minor = (nfs_minor + 1) & MAXMIN32;
2360                 nfs_dev = makedevice(nfs_major, nfs_minor);
2361         } while (vfs_devismounted(nfs_dev));
2362         mutex_exit(&nfs_minor_lock);
2363 
2364         vfsp->vfs_dev = nfs_dev;
2365         vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
2366         vfsp->vfs_data = (caddr_t)mi;
2367         vfsp->vfs_fstype = nfsfstyp;
2368         vfsp->vfs_bsize = nfs4_bsize;
2369 
2370         /*
2371          * Initialize fields used to support async putpage operations.
2372          */
2373         for (i = 0; i < NFS4_ASYNC_TYPES; i++)
2374                 mi->mi_async_clusters[i] = nfs4_async_clusters;
2375         mi->mi_async_init_clusters = nfs4_async_clusters;
2376         mi->mi_async_curr[NFS4_ASYNC_QUEUE] =
2377             mi->mi_async_curr[NFS4_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
2378         mi->mi_max_threads = nfs4_max_threads;
2379         mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
2380         cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
2381         cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE], NULL, CV_DEFAULT,
2382             NULL);
2383         cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE], NULL,
2384             CV_DEFAULT, NULL);
2385         cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
2386         cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
2387 
2388         mi->mi_vfsp = vfsp;
2389         mi->mi_zone = zone;
2390         zone_init_ref(&mi->mi_zone_ref);
2391         zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFSV4);
2392         nfs4_mi_zonelist_add(mi);
2393 
2394         /*
2395          * Initialize the <open owner/cred> hash table.
2396          */
2397         for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
2398                 bucketp = &(mi->mi_oo_list[i]);
2399                 mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
2400                 list_create(&bucketp->b_oo_hash_list,
2401                     sizeof (nfs4_open_owner_t),
2402                     offsetof(nfs4_open_owner_t, oo_hash_node));
2403         }
2404 
2405         /*
2406          * Initialize the freed open owner list.
2407          */
2408         mi->mi_foo_num = 0;
2409         mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
2410         list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
2411             offsetof(nfs4_open_owner_t, oo_foo_node));
2412 
2413         list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
2414             offsetof(nfs4_lost_rqst_t, lr_node));
2415 
2416         list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
2417             offsetof(nfs4_bseqid_entry_t, bs_node));
2418 
2419         /*
2420          * Initialize the msg buffer.
2421          */
2422         list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
2423             offsetof(nfs4_debug_msg_t, msg_node));
2424         mi->mi_msg_count = 0;
2425         mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
2426 
2427         /*
2428          * Initialize kstats
2429          */
2430         nfs4_mnt_kstat_init(vfsp);
2431 
2432         /*
2433          * Initialize the shared filehandle pool.
2434          */
2435         sfh4_createtab(&mi->mi_filehandles);
2436 
2437         /*
2438          * Save server path we're attempting to mount.
2439          */
2440         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2441         origsvp = copy_svp(svp);
2442         nfs_rw_exit(&svp->sv_lock);
2443 
2444         /*
2445          * Make the GETFH call to get root fh for each replica.
2446          */
2447         if (svp_head->sv_next)
2448                 droptext = ", dropping replica";
2449 
2450         /*
2451          * If the uid is set then set the creds for secure mounts
2452          * by proxy processes such as automountd.
2453          */
2454         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2455         if (svp->sv_secdata->uid != 0 &&
2456             svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
2457                 lcr = crdup(cr);
2458                 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
2459                 tcr = lcr;
2460         }
2461         nfs_rw_exit(&svp->sv_lock);
2462         for (svp = svp_head; svp; svp = svp->sv_next) {
2463                 if (nfs4_chkdup_servinfo4(svp_head, svp)) {
2464                         nfs_cmn_err(error, CE_WARN,
2465                             VERS_MSG "Host %s is a duplicate%s",
2466                             svp->sv_hostname, droptext);
2467                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2468                         svp->sv_flags |= SV4_NOTINUSE;
2469                         nfs_rw_exit(&svp->sv_lock);
2470                         continue;
2471                 }
2472                 mi->mi_curr_serv = svp;
2473 
2474                 /*
2475                  * Just in case server path being mounted contains
2476                  * symlinks and fails w/STALE, save the initial sv_path
2477                  * so we can redrive the initial mount compound with the
2478                  * initial sv_path -- not a symlink-expanded version.
2479                  *
2480                  * This could only happen if a symlink was expanded
2481                  * and the expanded mount compound failed stale.  Because
2482                  * it could be the case that the symlink was removed at
2483                  * the server (and replaced with another symlink/dir,
2484                  * we need to use the initial sv_path when attempting
2485                  * to re-lookup everything and recover.
2486                  *
2487                  * Other mount errors should evenutally be handled here also
2488                  * (NFS4ERR_DELAY, NFS4ERR_RESOURCE).  For now, all mount
2489                  * failures will result in mount being redriven a few times.
2490                  */
2491                 num_retry = nfs4_max_mount_retry;
2492                 do {
2493                         nfs4getfh_otw(mi, svp, &tmp_vtype,
2494                             ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
2495                             NFS4_GETFH_NEEDSOP, tcr, &e);
2496 
2497                         if (e.error == 0 && e.stat == NFS4_OK)
2498                                 break;
2499 
2500                         /*
2501                          * For some reason, the mount compound failed.  Before
2502                          * retrying, we need to restore original conditions.
2503                          */
2504                         svp = restore_svp(mi, svp, origsvp);
2505                         svp_head = svp;
2506 
2507                 } while (num_retry-- > 0);
2508                 error = e.error ? e.error : geterrno4(e.stat);
2509                 if (error) {
2510                         nfs_cmn_err(error, CE_WARN,
2511                             VERS_MSG "initial call to %s failed%s: %m",
2512                             svp->sv_hostname, droptext);
2513                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2514                         svp->sv_flags |= SV4_NOTINUSE;
2515                         nfs_rw_exit(&svp->sv_lock);
2516                         mi->mi_flags &= ~MI4_RECOV_FAIL;
2517                         mi->mi_error = 0;
2518                         continue;
2519                 }
2520 
2521                 if (tmp_vtype == VBAD) {
2522                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2523                             VERS_MSG "%s returned a bad file type for "
2524                             "root%s", svp->sv_hostname, droptext);
2525                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2526                         svp->sv_flags |= SV4_NOTINUSE;
2527                         nfs_rw_exit(&svp->sv_lock);
2528                         continue;
2529                 }
2530 
2531                 if (vtype == VNON) {
2532                         vtype = tmp_vtype;
2533                 } else if (vtype != tmp_vtype) {
2534                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2535                             VERS_MSG "%s returned a different file type "
2536                             "for root%s", svp->sv_hostname, droptext);
2537                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2538                         svp->sv_flags |= SV4_NOTINUSE;
2539                         nfs_rw_exit(&svp->sv_lock);
2540                         continue;
2541                 }
2542                 if (firstsvp == NULL)
2543                         firstsvp = svp;
2544         }
2545 
2546         if (firstsvp == NULL) {
2547                 if (error == 0)
2548                         error = ENOENT;
2549                 goto bad;
2550         }
2551 
2552         mi->mi_curr_serv = svp = firstsvp;
2553         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2554         ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
2555         fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2556         fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2557         mi->mi_rootfh = sfh4_get(&fh, mi);
2558         fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
2559         fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
2560         mi->mi_srvparentfh = sfh4_get(&fh, mi);
2561         nfs_rw_exit(&svp->sv_lock);
2562 
2563         /*
2564          * Get the fname for filesystem root.
2565          */
2566         mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
2567         mfname = mi->mi_fname;
2568         fn_hold(mfname);
2569 
2570         /*
2571          * Make the root vnode without attributes.
2572          */
2573         rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
2574             &mfname, NULL, mi, cr, gethrtime());
2575         rtvp->v_type = vtype;
2576 
2577         mi->mi_curread = mi->mi_tsize;
2578         mi->mi_curwrite = mi->mi_stsize;
2579 
2580         /*
2581          * Start the manager thread responsible for handling async worker
2582          * threads.
2583          */
2584         MI4_HOLD(mi);
2585         VFS_HOLD(vfsp); /* add reference for thread */
2586         mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
2587             vfsp, 0, minclsyspri);
2588         ASSERT(mi->mi_manager_thread != NULL);
2589 
2590         /*
2591          * Create the thread that handles over-the-wire calls for
2592          * VOP_INACTIVE.
2593          * This needs to happen after the manager thread is created.
2594          */
2595         MI4_HOLD(mi);
2596         mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
2597             mi, 0, minclsyspri);
2598         ASSERT(mi->mi_inactive_thread != NULL);
2599 
2600         /* If we didn't get a type, get one now */
2601         if (rtvp->v_type == VNON) {
2602                 va.va_mask = AT_TYPE;
2603                 error = nfs4getattr(rtvp, &va, tcr);
2604                 if (error)
2605                         goto bad;
2606                 rtvp->v_type = va.va_type;
2607         }
2608 
2609         mi->mi_type = rtvp->v_type;
2610 
2611         mutex_enter(&mi->mi_lock);
2612         mi->mi_flags &= ~MI4_MOUNTING;
2613         mutex_exit(&mi->mi_lock);
2614 
2615         /* Update VFS with new server and path info */
2616         if ((strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) ||
2617             (strcmp(svp->sv_path, origsvp->sv_path) != 0)) {
2618                 len = svp->sv_hostnamelen + svp->sv_pathlen;
2619                 resource = kmem_zalloc(len, KM_SLEEP);
2620                 (void) strcat(resource, svp->sv_hostname);
2621                 (void) strcat(resource, ":");
2622                 (void) strcat(resource, svp->sv_path);
2623                 vfs_setresource(vfsp, resource, 0);
2624                 kmem_free(resource, len);
2625         }
2626 
2627         sv4_free(origsvp);
2628         *rtvpp = rtvp;
2629         if (lcr != NULL)
2630                 crfree(lcr);
2631 
2632         return (0);
2633 bad:
2634         /*
2635          * An error occurred somewhere, need to clean up...
2636          */
2637         if (lcr != NULL)
2638                 crfree(lcr);
2639 
2640         if (rtvp != NULL) {
2641                 /*
2642                  * We need to release our reference to the root vnode and
2643                  * destroy the mntinfo4 struct that we just created.
2644                  */
2645                 rp = VTOR4(rtvp);
2646                 if (rp->r_flags & R4HASHED)
2647                         rp4_rmhash(rp);
2648                 VN_RELE(rtvp);
2649         }
2650         nfs4_async_stop(vfsp);
2651         nfs4_async_manager_stop(vfsp);
2652         removed = nfs4_mi_zonelist_remove(mi);
2653         if (removed)
2654                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2655 
2656         /*
2657          * This releases the initial "hold" of the mi since it will never
2658          * be referenced by the vfsp.  Also, when mount returns to vfs.c
2659          * with an error, the vfsp will be destroyed, not rele'd.
2660          */
2661         MI4_RELE(mi);
2662 
2663         if (origsvp != NULL)
2664                 sv4_free(origsvp);
2665 
2666         *rtvpp = NULL;
2667         return (error);
2668 }
2669 
2670 /*
2671  * vfs operations
2672  */
2673 static int
2674 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
2675 {
2676         mntinfo4_t              *mi;
2677         ushort_t                omax;
2678         int                     removed;
2679 
2680         bool_t                  must_unlock;
2681 
2682         nfs4_ephemeral_tree_t   *eph_tree;
2683 
2684         if (secpolicy_fs_unmount(cr, vfsp) != 0)
2685                 return (EPERM);
2686 
2687         mi = VFTOMI4(vfsp);
2688 
2689         if (flag & MS_FORCE) {
2690                 vfsp->vfs_flag |= VFS_UNMOUNTED;
2691                 if (nfs_zone() != mi->mi_zone) {
2692                         /*
2693                          * If the request is coming from the wrong zone,
2694                          * we don't want to create any new threads, and
2695                          * performance is not a concern.  Do everything
2696                          * inline.
2697                          */
2698                         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2699                             "nfs4_unmount x-zone forced unmount of vfs %p\n",
2700                             (void *)vfsp));
2701                         nfs4_free_mount(vfsp, flag, cr);
2702                 } else {
2703                         /*
2704                          * Free data structures asynchronously, to avoid
2705                          * blocking the current thread (for performance
2706                          * reasons only).
2707                          */
2708                         async_free_mount(vfsp, flag, cr);
2709                 }
2710 
2711                 return (0);
2712         }
2713 
2714         /*
2715          * Wait until all asynchronous putpage operations on
2716          * this file system are complete before flushing rnodes
2717          * from the cache.
2718          */
2719         omax = mi->mi_max_threads;
2720         if (nfs4_async_stop_sig(vfsp))
2721                 return (EINTR);
2722 
2723         r4flush(vfsp, cr);
2724 
2725         /*
2726          * About the only reason that this would fail would be
2727          * that the harvester is already busy tearing down this
2728          * node. So we fail back to the caller and let them try
2729          * again when needed.
2730          */
2731         if (nfs4_ephemeral_umount(mi, flag, cr,
2732             &must_unlock, &eph_tree)) {
2733                 ASSERT(must_unlock == FALSE);
2734                 mutex_enter(&mi->mi_async_lock);
2735                 mi->mi_max_threads = omax;
2736                 mutex_exit(&mi->mi_async_lock);
2737 
2738                 return (EBUSY);
2739         }
2740 
2741         /*
2742          * If there are any active vnodes on this file system,
2743          * then the file system is busy and can't be unmounted.
2744          */
2745         if (check_rtable4(vfsp)) {
2746                 nfs4_ephemeral_umount_unlock(&must_unlock, &eph_tree);
2747 
2748                 mutex_enter(&mi->mi_async_lock);
2749                 mi->mi_max_threads = omax;
2750                 mutex_exit(&mi->mi_async_lock);
2751 
2752                 return (EBUSY);
2753         }
2754 
2755         /*
2756          * The unmount can't fail from now on, so record any
2757          * ephemeral changes.
2758          */
2759         nfs4_ephemeral_umount_activate(mi, &must_unlock, &eph_tree);
2760 
2761         /*
2762          * There are no active files that could require over-the-wire
2763          * calls to the server, so stop the async manager and the
2764          * inactive thread.
2765          */
2766         nfs4_async_manager_stop(vfsp);
2767 
2768         /*
2769          * Destroy all rnodes belonging to this file system from the
2770          * rnode hash queues and purge any resources allocated to
2771          * them.
2772          */
2773         destroy_rtable4(vfsp, cr);
2774         vfsp->vfs_flag |= VFS_UNMOUNTED;
2775 
2776         nfs4_remove_mi_from_server(mi, NULL);
2777         removed = nfs4_mi_zonelist_remove(mi);
2778         if (removed)
2779                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2780 
2781         return (0);
2782 }
2783 
2784 /*
2785  * find root of nfs
2786  */
2787 static int
2788 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
2789 {
2790         mntinfo4_t *mi;
2791         vnode_t *vp;
2792         nfs4_fname_t *mfname;
2793         servinfo4_t *svp;
2794 
2795         mi = VFTOMI4(vfsp);
2796 
2797         if (nfs_zone() != mi->mi_zone)
2798                 return (EPERM);
2799 
2800         svp = mi->mi_curr_serv;
2801         if (svp) {
2802                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2803                 if (svp->sv_flags & SV4_ROOT_STALE) {
2804                         nfs_rw_exit(&svp->sv_lock);
2805 
2806                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2807                         if (svp->sv_flags & SV4_ROOT_STALE) {
2808                                 svp->sv_flags &= ~SV4_ROOT_STALE;
2809                                 nfs_rw_exit(&svp->sv_lock);
2810                                 return (ENOENT);
2811                         }
2812                         nfs_rw_exit(&svp->sv_lock);
2813                 } else
2814                         nfs_rw_exit(&svp->sv_lock);
2815         }
2816 
2817         mfname = mi->mi_fname;
2818         fn_hold(mfname);
2819         vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
2820             VFTOMI4(vfsp), CRED(), gethrtime());
2821 
2822         if (VTOR4(vp)->r_flags & R4STALE) {
2823                 VN_RELE(vp);
2824                 return (ENOENT);
2825         }
2826 
2827         ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
2828 
2829         vp->v_type = mi->mi_type;
2830 
2831         *vpp = vp;
2832 
2833         return (0);
2834 }
2835 
2836 static int
2837 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
2838 {
2839         int error;
2840         nfs4_ga_res_t gar;
2841         nfs4_ga_ext_res_t ger;
2842 
2843         gar.n4g_ext_res = &ger;
2844 
2845         if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
2846             NFS4_STATFS_ATTR_MASK, cr))
2847                 return (error);
2848 
2849         *sbp = gar.n4g_ext_res->n4g_sb;
2850 
2851         return (0);
2852 }
2853 
2854 /*
2855  * Get file system statistics.
2856  */
2857 static int
2858 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
2859 {
2860         int error;
2861         vnode_t *vp;
2862         cred_t *cr;
2863 
2864         error = nfs4_root(vfsp, &vp);
2865         if (error)
2866                 return (error);
2867 
2868         cr = CRED();
2869 
2870         error = nfs4_statfs_otw(vp, sbp, cr);
2871         if (!error) {
2872                 (void) strncpy(sbp->f_basetype,
2873                     vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
2874                 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
2875         } else {
2876                 nfs4_purge_stale_fh(error, vp, cr);
2877         }
2878 
2879         VN_RELE(vp);
2880 
2881         return (error);
2882 }
2883 
2884 static kmutex_t nfs4_syncbusy;
2885 
2886 /*
2887  * Flush dirty nfs files for file system vfsp.
2888  * If vfsp == NULL, all nfs files are flushed.
2889  *
2890  * SYNC_CLOSE in flag is passed to us to
2891  * indicate that we are shutting down and or
2892  * rebooting.
2893  */
2894 static int
2895 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
2896 {
2897         /*
2898          * Cross-zone calls are OK here, since this translates to a
2899          * VOP_PUTPAGE(B_ASYNC), which gets picked up by the right zone.
2900          */
2901         if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
2902                 r4flush(vfsp, cr);
2903                 mutex_exit(&nfs4_syncbusy);
2904         }
2905 
2906         /*
2907          * if SYNC_CLOSE is set then we know that
2908          * the system is rebooting, mark the mntinfo
2909          * for later examination.
2910          */
2911         if (vfsp && (flag & SYNC_CLOSE)) {
2912                 mntinfo4_t *mi;
2913 
2914                 mi = VFTOMI4(vfsp);
2915                 if (!(mi->mi_flags & MI4_SHUTDOWN)) {
2916                         mutex_enter(&mi->mi_lock);
2917                         mi->mi_flags |= MI4_SHUTDOWN;
2918                         mutex_exit(&mi->mi_lock);
2919                 }
2920         }
2921         return (0);
2922 }
2923 
2924 /*
2925  * vget is difficult, if not impossible, to support in v4 because we don't
2926  * know the parent directory or name, which makes it impossible to create a
2927  * useful shadow vnode.  And we need the shadow vnode for things like
2928  * OPEN.
2929  */
2930 
2931 /* ARGSUSED */
2932 /*
2933  * XXX Check nfs4_vget_pseudo() for dependency.
2934  */
2935 static int
2936 nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2937 {
2938         return (EREMOTE);
2939 }
2940 
2941 /*
2942  * nfs4_mountroot get called in the case where we are diskless booting.  All
2943  * we need from here is the ability to get the server info and from there we
2944  * can simply call nfs4_rootvp.
2945  */
2946 /* ARGSUSED */
2947 static int
2948 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
2949 {
2950         vnode_t *rtvp;
2951         char root_hostname[SYS_NMLN+1];
2952         struct servinfo4 *svp;
2953         int error;
2954         int vfsflags;
2955         size_t size;
2956         char *root_path;
2957         struct pathname pn;
2958         char *name;
2959         cred_t *cr;
2960         mntinfo4_t *mi;
2961         struct nfs_args args;           /* nfs mount arguments */
2962         static char token[10];
2963         nfs4_error_t n4e;
2964 
2965         bzero(&args, sizeof (args));
2966 
2967         /* do this BEFORE getfile which causes xid stamps to be initialized */
2968         clkset(-1L);            /* hack for now - until we get time svc? */
2969 
2970         if (why == ROOT_REMOUNT) {
2971                 /*
2972                  * Shouldn't happen.
2973                  */
2974                 panic("nfs4_mountroot: why == ROOT_REMOUNT");
2975         }
2976 
2977         if (why == ROOT_UNMOUNT) {
2978                 /*
2979                  * Nothing to do for NFS.
2980                  */
2981                 return (0);
2982         }
2983 
2984         /*
2985          * why == ROOT_INIT
2986          */
2987 
2988         name = token;
2989         *name = 0;
2990         (void) getfsname("root", name, sizeof (token));
2991 
2992         pn_alloc(&pn);
2993         root_path = pn.pn_path;
2994 
2995         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2996         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2997         svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
2998         svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2999         svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
3000 
3001         /*
3002          * Get server address
3003          * Get the root path
3004          * Get server's transport
3005          * Get server's hostname
3006          * Get options
3007          */
3008         args.addr = &svp->sv_addr;
3009         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3010         args.fh = (char *)&svp->sv_fhandle;
3011         args.knconf = svp->sv_knconf;
3012         args.hostname = root_hostname;
3013         vfsflags = 0;
3014         if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
3015             &args, &vfsflags)) {
3016                 if (error == EPROTONOSUPPORT)
3017                         nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
3018                             "mount_root failed: server doesn't support NFS V4");
3019                 else
3020                         nfs_cmn_err(error, CE_WARN,
3021                             "nfs4_mountroot: mount_root failed: %m");
3022                 nfs_rw_exit(&svp->sv_lock);
3023                 sv4_free(svp);
3024                 pn_free(&pn);
3025                 return (error);
3026         }
3027         nfs_rw_exit(&svp->sv_lock);
3028         svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
3029         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
3030         (void) strcpy(svp->sv_hostname, root_hostname);
3031 
3032         svp->sv_pathlen = (int)(strlen(root_path) + 1);
3033         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
3034         (void) strcpy(svp->sv_path, root_path);
3035 
3036         /*
3037          * Force root partition to always be mounted with AUTH_UNIX for now
3038          */
3039         svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
3040         svp->sv_secdata->secmod = AUTH_UNIX;
3041         svp->sv_secdata->rpcflavor = AUTH_UNIX;
3042         svp->sv_secdata->data = NULL;
3043 
3044         cr = crgetcred();
3045         rtvp = NULL;
3046 
3047         error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
3048 
3049         if (error) {
3050                 crfree(cr);
3051                 pn_free(&pn);
3052                 sv4_free(svp);
3053                 return (error);
3054         }
3055 
3056         mi = VTOMI4(rtvp);
3057 
3058         /*
3059          * Send client id to the server, if necessary
3060          */
3061         nfs4_error_zinit(&n4e);
3062         nfs4setclientid(mi, cr, FALSE, &n4e);
3063         error = n4e.error;
3064 
3065         crfree(cr);
3066 
3067         if (error) {
3068                 pn_free(&pn);
3069                 goto errout;
3070         }
3071 
3072         error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
3073         if (error) {
3074                 nfs_cmn_err(error, CE_WARN,
3075                     "nfs4_mountroot: invalid root mount options");
3076                 pn_free(&pn);
3077                 goto errout;
3078         }
3079 
3080         (void) vfs_lock_wait(vfsp);
3081         vfs_add(NULL, vfsp, vfsflags);
3082         vfs_unlock(vfsp);
3083 
3084         size = strlen(svp->sv_hostname);
3085         (void) strcpy(rootfs.bo_name, svp->sv_hostname);
3086         rootfs.bo_name[size] = ':';
3087         (void) strcpy(&rootfs.bo_name[size + 1], root_path);
3088 
3089         pn_free(&pn);
3090 
3091 errout:
3092         if (error) {
3093                 sv4_free(svp);
3094                 nfs4_async_stop(vfsp);
3095                 nfs4_async_manager_stop(vfsp);
3096         }
3097 
3098         if (rtvp != NULL)
3099                 VN_RELE(rtvp);
3100 
3101         return (error);
3102 }
3103 
3104 /*
3105  * Initialization routine for VFS routines.  Should only be called once
3106  */
3107 int
3108 nfs4_vfsinit(void)
3109 {
3110         mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
3111         nfs4setclientid_init();
3112         nfs4_ephemeral_init();
3113         return (0);
3114 }
3115 
3116 void
3117 nfs4_vfsfini(void)
3118 {
3119         nfs4_ephemeral_fini();
3120         nfs4setclientid_fini();
3121         mutex_destroy(&nfs4_syncbusy);
3122 }
3123 
3124 void
3125 nfs4_freevfs(vfs_t *vfsp)
3126 {
3127         mntinfo4_t *mi;
3128 
3129         /* need to release the initial hold */
3130         mi = VFTOMI4(vfsp);
3131 
3132         /*
3133          * At this point, we can no longer reference the vfs
3134          * and need to inform other holders of the reference
3135          * to the mntinfo4_t.
3136          */
3137         mi->mi_vfsp = NULL;
3138 
3139         MI4_RELE(mi);
3140 }
3141 
3142 /*
3143  * Client side SETCLIENTID and SETCLIENTID_CONFIRM
3144  */
3145 struct nfs4_server nfs4_server_lst =
3146         { &nfs4_server_lst, &nfs4_server_lst };
3147 
3148 kmutex_t nfs4_server_lst_lock;
3149 
3150 static void
3151 nfs4setclientid_init(void)
3152 {
3153         mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
3154 }
3155 
3156 static void
3157 nfs4setclientid_fini(void)
3158 {
3159         mutex_destroy(&nfs4_server_lst_lock);
3160 }
3161 
3162 int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
3163 int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
3164 
/*
 * Set the clientid for the server for "mi".  No-op if the clientid is
 * already set.
 *
 * The server is mi->mi_curr_serv.  Its nfs4_server_t is looked up on
 * nfs4_server_lst, or created and inserted there if none exists yet.  If
 * another thread is already negotiating the clientid (N4S_CLIENTID_PEND),
 * this thread waits for it; the wait is interruptible and yields EINTR.
 *
 * The recovery boolean should be set to TRUE if this function was called
 * by the recovery code, and FALSE otherwise.  This is used to determine
 * if we need to call nfs4_start/end_op as well as grab the mi_recovlock
 * for adding a mntinfo4_t to a nfs4_server_t.
 *
 * 'cr' is the credential used for the over-the-wire calls.  If the first
 * attempt fails with EACCES and the servinfo4's secdata carries a non-zero
 * uid, a duplicate of 'cr' with that uid is tried once and saved as the
 * server's s_cred.
 *
 * Retryable failures (RPC errors accepted by nfs4_rpc_retry_error(),
 * NFS4ERR_RESOURCE, NFS4ERR_STALE_CLIENTID, and NFS4ERR_CLID_INUSE while
 * retry_inuse is still set) restart the whole sequence after a delay, up
 * to nfs4_num_sclid_retries times.  The retry counter is reset when in
 * recovery or on ETIMEDOUT/ECONNRESET, so those cases retry indefinitely.
 *
 * Error is returned via 'n4ep'.  If there was a 'n4ep->stat' error, then
 * 'n4ep->error' is set to geterrno4(n4ep->stat).
 */
void
nfs4setclientid(mntinfo4_t *mi, cred_t *cr, bool_t recovery, nfs4_error_t *n4ep)
{
        struct nfs4_server *np;
        struct servinfo4 *svp = mi->mi_curr_serv;
        nfs4_recov_state_t recov_state;
        int num_retries = 0;
        bool_t retry;
        cred_t *lcr = NULL;
        int retry_inuse = 1; /* only retry once on NFS4ERR_CLID_INUSE */
        time_t lease_time = 0;

        recov_state.rs_flags = 0;
        recov_state.rs_num_retry_despite_err = 0;
        ASSERT(n4ep != NULL);

recov_retry:
        /* every pass starts with a clean error state and a fresh lookup */
        retry = FALSE;
        nfs4_error_zinit(n4ep);
        if (!recovery)
                (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);

        mutex_enter(&nfs4_server_lst_lock);
        np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
        mutex_exit(&nfs4_server_lst_lock);
        if (!np) {
                struct nfs4_server *tnp;
                /*
                 * Not on the list: build a candidate outside the list lock,
                 * then re-check under the lock in case we lost a race.
                 */
                np = new_nfs4_server(svp, cr);
                mutex_enter(&np->s_lock);

                mutex_enter(&nfs4_server_lst_lock);
                tnp = servinfo4_to_nfs4_server(svp);
                if (tnp) {
                        /*
                         * another thread snuck in and put server on list.
                         * since we aren't adding it to the nfs4_server_list
                         * we need to set the ref count to 0 and destroy it.
                         */
                        np->s_refcnt = 0;
                        destroy_nfs4_server(np);
                        np = tnp;
                } else {
                        /*
                         * do not give list a reference until everything
                         * succeeds
                         */
                        insque(np, &nfs4_server_lst);
                }
                mutex_exit(&nfs4_server_lst_lock);
        }
        /* from here on np is locked; each exit path releases it once */
        ASSERT(MUTEX_HELD(&np->s_lock));
        /*
         * If we find the server already has N4S_CLIENTID_SET, then
         * just return, we've already done SETCLIENTID to that server
         */
        if (np->s_flags & N4S_CLIENTID_SET) {
                /* add mi to np's mntinfo4_list */
                nfs4_add_mi_to_server(np, mi);
                if (!recovery)
                        nfs_rw_exit(&mi->mi_recovlock);
                mutex_exit(&np->s_lock);
                nfs4_server_rele(np);
                return;
        }
        mutex_exit(&np->s_lock);


        /*
         * Drop the mi_recovlock since nfs4_start_op will
         * acquire it again for us.
         */
        if (!recovery) {
                nfs_rw_exit(&mi->mi_recovlock);

                n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
                if (n4ep->error) {
                        nfs4_server_rele(np);
                        return;
                }
        }

        /*
         * Only one thread at a time negotiates a clientid with this
         * server; wait for any negotiation already in flight.
         */
        mutex_enter(&np->s_lock);
        while (np->s_flags & N4S_CLIENTID_PEND) {
                if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
                        /* interrupted by a signal: unwind and give up */
                        mutex_exit(&np->s_lock);
                        nfs4_server_rele(np);
                        if (!recovery)
                                nfs4_end_op(mi, NULL, NULL, &recov_state,
                                    recovery);
                        n4ep->error = EINTR;
                        return;
                }
        }

        /* the thread we waited for may have set the clientid for us */
        if (np->s_flags & N4S_CLIENTID_SET) {
                /* XXX copied/pasted from above */
                /* add mi to np's mntinfo4_list */
                nfs4_add_mi_to_server(np, mi);
                mutex_exit(&np->s_lock);
                nfs4_server_rele(np);
                if (!recovery)
                        nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
                return;
        }

        /*
         * Reset the N4S_CB_PINGED flag. This is used to
         * indicate if we have received a CB_NULL from the
         * server. Also we reset the waiter flag.
         */
        np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
        /* any failure must now clear this flag */
        np->s_flags |= N4S_CLIENTID_PEND;
        mutex_exit(&np->s_lock);
        nfs4setclientid_otw(mi, svp, cr, np, n4ep, &retry_inuse);

        if (n4ep->error == EACCES) {
                /*
                 * If the uid is set then set the creds for secure mounts
                 * by proxy processes such as automountd.
                 */
                (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
                if (svp->sv_secdata->uid != 0) {
                        lcr = crdup(cr);
                        (void) crsetugid(lcr, svp->sv_secdata->uid,
                            crgetgid(cr));
                }
                nfs_rw_exit(&svp->sv_lock);

                if (lcr != NULL) {
                        /*
                         * Hand the duplicated cred to np (replacing the one
                         * it held) and try the exchange once more with it.
                         */
                        mutex_enter(&np->s_lock);
                        crfree(np->s_cred);
                        np->s_cred = lcr;
                        mutex_exit(&np->s_lock);
                        nfs4setclientid_otw(mi, svp, lcr, np, n4ep,
                            &retry_inuse);
                }
        }
        /*
         * The attempt is over, successful or not: remember the lease time
         * for the retry delay below and clear the pending flag.
         */
        mutex_enter(&np->s_lock);
        lease_time = np->s_lease_time;
        np->s_flags &= ~N4S_CLIENTID_PEND;
        mutex_exit(&np->s_lock);

        if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
                /*
                 * Start recovery if failover is a possibility.  If
                 * invoked by the recovery thread itself, then just
                 * return and let it handle the failover first.  NB:
                 * recovery is not allowed if the mount is in progress
                 * since the infrastructure is not sufficiently setup
                 * to allow it.  Just return the error (after suitable
                 * retries).
                 */
                if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
                        (void) nfs4_start_recovery(n4ep, mi, NULL,
                            NULL, NULL, NULL, OP_SETCLIENTID, NULL, NULL, NULL);
                        /*
                         * Don't retry here, just return and let
                         * recovery take over.
                         *
                         * NOTE(review): retry is already FALSE at this
                         * point (set at recov_retry and not yet changed),
                         * so the assignment below has no effect.
                         */
                        if (recovery)
                                retry = FALSE;
                } else if (nfs4_rpc_retry_error(n4ep->error) ||
                    n4ep->stat == NFS4ERR_RESOURCE ||
                    n4ep->stat == NFS4ERR_STALE_CLIENTID) {

                        retry = TRUE;
                        /*
                         * Always retry if in recovery or once had
                         * contact with the server (but now it's
                         * overloaded).
                         */
                        if (recovery == TRUE ||
                            n4ep->error == ETIMEDOUT ||
                            n4ep->error == ECONNRESET)
                                num_retries = 0;
                } else if (retry_inuse && n4ep->error == 0 &&
                    n4ep->stat == NFS4ERR_CLID_INUSE) {
                        retry = TRUE;
                        num_retries = 0;
                }
        } else {
                /*
                 * Since everything succeeded give the list a reference count if
                 * it hasn't been given one by add_new_nfs4_server() or if this
                 * is not a recovery situation in which case it is already on
                 * the list.
                 */
                mutex_enter(&np->s_lock);
                if ((np->s_flags & N4S_INSERTED) == 0) {
                        np->s_refcnt++;
                        np->s_flags |= N4S_INSERTED;
                }
                mutex_exit(&np->s_lock);
        }

        if (!recovery)
                nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);


        if (retry && num_retries++ < nfs4_num_sclid_retries) {
                /*
                 * NOTE(review): retry_inuse starts at 1 and is only cleared
                 * here, so the first retry of *any* kind takes the longer
                 * lease-based delay and uses up the single CLID_INUSE
                 * retry.  Confirm that this is intended.
                 */
                if (retry_inuse) {
                        delay(SEC_TO_TICK(lease_time + nfs4_retry_sclid_delay));
                        retry_inuse = 0;
                } else
                        delay(SEC_TO_TICK(nfs4_retry_sclid_delay));

                /* drop our reference; the next pass looks np up again */
                nfs4_server_rele(np);
                goto recov_retry;
        }


        /* fold a protocol-level failure into an errno for the caller */
        if (n4ep->error == 0)
                n4ep->error = geterrno4(n4ep->stat);

        /* broadcast before release in case no other threads are waiting */
        cv_broadcast(&np->s_clientid_pend);
        nfs4_server_rele(np);
}
3396 
/*
 * When non-zero on a DEBUG build, nfs4setclientid_otw() logs the clientid
 * and confirm verifier returned by a successful SETCLIENTID.
 */
int nfs4setclientid_otw_debug = 0;
3398 
3399 /*
3400  * This function handles the recovery of STALE_CLIENTID for SETCLIENTID_CONFRIM,
3401  * but nothing else; the calling function must be designed to handle those
3402  * other errors.
3403  */
3404 static void
3405 nfs4setclientid_otw(mntinfo4_t *mi, struct servinfo4 *svp,  cred_t *cr,
3406     struct nfs4_server *np, nfs4_error_t *ep, int *retry_inusep)
3407 {
3408         COMPOUND4args_clnt args;
3409         COMPOUND4res_clnt res;
3410         nfs_argop4 argop[3];
3411         SETCLIENTID4args *s_args;
3412         SETCLIENTID4resok *s_resok;
3413         int doqueue = 1;
3414         nfs4_ga_res_t *garp = NULL;
3415         timespec_t prop_time, after_time;
3416         verifier4 verf;
3417         clientid4 tmp_clientid;
3418 
3419         ASSERT(!MUTEX_HELD(&np->s_lock));
3420 
3421         args.ctag = TAG_SETCLIENTID;
3422 
3423         args.array = argop;
3424         args.array_len = 3;
3425 
3426         /* PUTROOTFH */
3427         argop[0].argop = OP_PUTROOTFH;
3428 
3429         /* GETATTR */
3430         argop[1].argop = OP_GETATTR;
3431         argop[1].nfs_argop4_u.opgetattr.attr_request = FATTR4_LEASE_TIME_MASK;
3432         argop[1].nfs_argop4_u.opgetattr.mi = mi;
3433 
3434         /* SETCLIENTID */
3435         argop[2].argop = OP_SETCLIENTID;
3436 
3437         s_args = &argop[2].nfs_argop4_u.opsetclientid;
3438 
3439         mutex_enter(&np->s_lock);
3440 
3441         s_args->client.verifier = np->clidtosend.verifier;
3442         s_args->client.id_len = np->clidtosend.id_len;
3443         ASSERT(s_args->client.id_len <= NFS4_OPAQUE_LIMIT);
3444         s_args->client.id_val = np->clidtosend.id_val;
3445 
3446         /*
3447          * Callback needs to happen on non-RDMA transport
3448          * Check if we have saved the original knetconfig
3449          * if so, use that instead.
3450          */
3451         if (svp->sv_origknconf != NULL)
3452                 nfs4_cb_args(np, svp->sv_origknconf, s_args);
3453         else
3454                 nfs4_cb_args(np, svp->sv_knconf, s_args);
3455 
3456         mutex_exit(&np->s_lock);
3457 
3458         rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3459 
3460         if (ep->error)
3461                 return;
3462 
3463         /* getattr lease_time res */
3464         if ((res.array_len >= 2) &&
3465             (res.array[1].nfs_resop4_u.opgetattr.status == NFS4_OK)) {
3466                 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3467 
3468 #ifndef _LP64
3469                 /*
3470                  * The 32 bit client cannot handle a lease time greater than
3471                  * (INT32_MAX/1000000).  This is due to the use of the
3472                  * lease_time in calls to drv_usectohz() in
3473                  * nfs4_renew_lease_thread().  The problem is that
3474                  * drv_usectohz() takes a time_t (which is just a long = 4
3475                  * bytes) as its parameter.  The lease_time is multiplied by
3476                  * 1000000 to convert seconds to usecs for the parameter.  If
3477                  * a number bigger than (INT32_MAX/1000000) is used then we
3478                  * overflow on the 32bit client.
3479                  */
3480                 if (garp->n4g_ext_res->n4g_leasetime > (INT32_MAX/1000000)) {
3481                         garp->n4g_ext_res->n4g_leasetime = INT32_MAX/1000000;
3482                 }
3483 #endif
3484 
3485                 mutex_enter(&np->s_lock);
3486                 np->s_lease_time = garp->n4g_ext_res->n4g_leasetime;
3487 
3488                 /*
3489                  * Keep track of the lease period for the mi's
3490                  * mi_msg_list.  We need an appropiate time
3491                  * bound to associate past facts with a current
3492                  * event.  The lease period is perfect for this.
3493                  */
3494                 mutex_enter(&mi->mi_msg_list_lock);
3495                 mi->mi_lease_period = np->s_lease_time;
3496                 mutex_exit(&mi->mi_msg_list_lock);
3497                 mutex_exit(&np->s_lock);
3498         }
3499 
3500 
3501         if (res.status == NFS4ERR_CLID_INUSE) {
3502                 clientaddr4 *clid_inuse;
3503 
3504                 if (!(*retry_inusep)) {
3505                         clid_inuse = &res.array->nfs_resop4_u.
3506                             opsetclientid.SETCLIENTID4res_u.client_using;
3507 
3508                         zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3509                             "NFS4 mount (SETCLIENTID failed)."
3510                             "  nfs4_client_id.id is in"
3511                             "use already by: r_netid<%s> r_addr<%s>",
3512                             clid_inuse->r_netid, clid_inuse->r_addr);
3513                 }
3514 
3515                 /*
3516                  * XXX - The client should be more robust in its
3517                  * handling of clientid in use errors (regen another
3518                  * clientid and try again?)
3519                  */
3520                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3521                 return;
3522         }
3523 
3524         if (res.status) {
3525                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3526                 return;
3527         }
3528 
3529         s_resok = &res.array[2].nfs_resop4_u.
3530             opsetclientid.SETCLIENTID4res_u.resok4;
3531 
3532         tmp_clientid = s_resok->clientid;
3533 
3534         verf = s_resok->setclientid_confirm;
3535 
3536 #ifdef  DEBUG
3537         if (nfs4setclientid_otw_debug) {
3538                 union {
3539                         clientid4       clientid;
3540                         int             foo[2];
3541                 } cid;
3542 
3543                 cid.clientid = s_resok->clientid;
3544 
3545                 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3546                 "nfs4setclientid_otw: OK, clientid = %x,%x, "
3547                 "verifier = %" PRIx64 "\n", cid.foo[0], cid.foo[1], verf);
3548         }
3549 #endif
3550 
3551         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3552 
3553         /* Confirm the client id and get the lease_time attribute */
3554 
3555         args.ctag = TAG_SETCLIENTID_CF;
3556 
3557         args.array = argop;
3558         args.array_len = 1;
3559 
3560         argop[0].argop = OP_SETCLIENTID_CONFIRM;
3561 
3562         argop[0].nfs_argop4_u.opsetclientid_confirm.clientid = tmp_clientid;
3563         argop[0].nfs_argop4_u.opsetclientid_confirm.setclientid_confirm = verf;
3564 
3565         /* used to figure out RTT for np */
3566         gethrestime(&prop_time);
3567 
3568         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlientid_otw: "
3569             "start time: %ld sec %ld nsec", prop_time.tv_sec,
3570             prop_time.tv_nsec));
3571 
3572         rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3573 
3574         gethrestime(&after_time);
3575         mutex_enter(&np->s_lock);
3576         np->propagation_delay.tv_sec =
3577             MAX(1, after_time.tv_sec - prop_time.tv_sec);
3578         mutex_exit(&np->s_lock);
3579 
3580         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlcientid_otw: "
3581             "finish time: %ld sec ", after_time.tv_sec));
3582 
3583         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3584             "propagation delay set to %ld sec",
3585             np->propagation_delay.tv_sec));
3586 
3587         if (ep->error)
3588                 return;
3589 
3590         if (res.status == NFS4ERR_CLID_INUSE) {
3591                 clientaddr4 *clid_inuse;
3592 
3593                 if (!(*retry_inusep)) {
3594                         clid_inuse = &res.array->nfs_resop4_u.
3595                             opsetclientid.SETCLIENTID4res_u.client_using;
3596 
3597                         zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3598                             "SETCLIENTID_CONFIRM failed.  "
3599                             "nfs4_client_id.id is in use already by: "
3600                             "r_netid<%s> r_addr<%s>",
3601                             clid_inuse->r_netid, clid_inuse->r_addr);
3602                 }
3603 
3604                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3605                 return;
3606         }
3607 
3608         if (res.status) {
3609                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3610                 return;
3611         }
3612 
3613         mutex_enter(&np->s_lock);
3614         np->clientid = tmp_clientid;
3615         np->s_flags |= N4S_CLIENTID_SET;
3616 
3617         /* Add mi to np's mntinfo4 list */
3618         nfs4_add_mi_to_server(np, mi);
3619 
3620         if (np->lease_valid == NFS4_LEASE_NOT_STARTED) {
3621                 /*
3622                  * Start lease management thread.
3623                  * Keep trying until we succeed.
3624                  */
3625 
3626                 np->s_refcnt++;              /* pass reference to thread */
3627                 (void) zthread_create(NULL, 0, nfs4_renew_lease_thread, np, 0,
3628                     minclsyspri);
3629         }
3630         mutex_exit(&np->s_lock);
3631 
3632         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3633 }
3634 
3635 /*
3636  * Add mi to sp's mntinfo4_list if it isn't already in the list.  Makes
3637  * mi's clientid the same as sp's.
3638  * Assumes sp is locked down.
3639  */
3640 void
3641 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
3642 {
3643         mntinfo4_t *tmi;
3644         int in_list = 0;
3645 
3646         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3647             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3648         ASSERT(sp != &nfs4_server_lst);
3649         ASSERT(MUTEX_HELD(&sp->s_lock));
3650 
3651         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652             "nfs4_add_mi_to_server: add mi %p to sp %p",
3653             (void*)mi, (void*)sp));
3654 
3655         for (tmi = sp->mntinfo4_list;
3656             tmi != NULL;
3657             tmi = tmi->mi_clientid_next) {
3658                 if (tmi == mi) {
3659                         NFS4_DEBUG(nfs4_client_lease_debug,
3660                             (CE_NOTE,
3661                             "nfs4_add_mi_to_server: mi in list"));
3662                         in_list = 1;
3663                 }
3664         }
3665 
3666         /*
3667          * First put a hold on the mntinfo4's vfsp so that references via
3668          * mntinfo4_list will be valid.
3669          */
3670         if (!in_list)
3671                 VFS_HOLD(mi->mi_vfsp);
3672 
3673         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
3674             "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
3675 
3676         if (!in_list) {
3677                 if (sp->mntinfo4_list)
3678                         sp->mntinfo4_list->mi_clientid_prev = mi;
3679                 mi->mi_clientid_next = sp->mntinfo4_list;
3680                 mi->mi_srv = sp;
3681                 sp->mntinfo4_list = mi;
3682                 mi->mi_srvsettime = gethrestime_sec();
3683                 mi->mi_srvset_cnt++;
3684         }
3685 
3686         /* set mi's clientid to that of sp's for later matching */
3687         mi->mi_clientid = sp->clientid;
3688 
3689         /*
3690          * Update the clientid for any other mi's belonging to sp.  This
3691          * must be done here while we hold sp->s_lock, so that
3692          * find_nfs4_server() continues to work.
3693          */
3694 
3695         for (tmi = sp->mntinfo4_list;
3696             tmi != NULL;
3697             tmi = tmi->mi_clientid_next) {
3698                 if (tmi != mi) {
3699                         tmi->mi_clientid = sp->clientid;
3700                 }
3701         }
3702 }
3703 
3704 /*
3705  * Remove the mi from sp's mntinfo4_list and release its reference.
3706  * Exception: if mi still has open files, flag it for later removal (when
3707  * all the files are closed).
3708  *
3709  * If this is the last mntinfo4 in sp's list then tell the lease renewal
3710  * thread to exit.
3711  */
3712 static void
3713 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
3714 {
3715         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3716             "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
3717             (void*)mi, (void*)sp));
3718 
3719         ASSERT(sp != NULL);
3720         ASSERT(MUTEX_HELD(&sp->s_lock));
3721         ASSERT(mi->mi_open_files >= 0);
3722 
3723         /*
3724          * First make sure this mntinfo4 can be taken off of the list,
3725          * ie: it doesn't have any open files remaining.
3726          */
3727         if (mi->mi_open_files > 0) {
3728                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3729                     "nfs4_remove_mi_from_server_nolock: don't "
3730                     "remove mi since it still has files open"));
3731 
3732                 mutex_enter(&mi->mi_lock);
3733                 mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
3734                 mutex_exit(&mi->mi_lock);
3735                 return;
3736         }
3737 
3738         VFS_HOLD(mi->mi_vfsp);
3739         remove_mi(sp, mi);
3740         VFS_RELE(mi->mi_vfsp);
3741 
3742         if (sp->mntinfo4_list == NULL) {
3743                 /* last fs unmounted, kill the thread */
3744                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3745                     "remove_mi_from_nfs4_server_nolock: kill the thread"));
3746                 nfs4_mark_srv_dead(sp);
3747         }
3748 }
3749 
3750 /*
3751  * Remove mi from sp's mntinfo4_list and release the vfs reference.
3752  */
3753 static void
3754 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
3755 {
3756         ASSERT(MUTEX_HELD(&sp->s_lock));
3757 
3758         /*
3759          * We release a reference, and the caller must still have a
3760          * reference.
3761          */
3762         ASSERT(mi->mi_vfsp->vfs_count >= 2);
3763 
3764         if (mi->mi_clientid_prev) {
3765                 mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
3766         } else {
3767                 /* This is the first mi in sp's mntinfo4_list */
3768                 /*
3769                  * Make sure the first mntinfo4 in the list is the actual
3770                  * mntinfo4 passed in.
3771                  */
3772                 ASSERT(sp->mntinfo4_list == mi);
3773 
3774                 sp->mntinfo4_list = mi->mi_clientid_next;
3775         }
3776         if (mi->mi_clientid_next)
3777                 mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
3778 
3779         /* Now mark the mntinfo4's links as being removed */
3780         mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
3781         mi->mi_srv = NULL;
3782         mi->mi_srvset_cnt++;
3783 
3784         VFS_RELE(mi->mi_vfsp);
3785 }
3786 
3787 /*
3788  * Free all the entries in sp's mntinfo4_list.
3789  */
3790 static void
3791 remove_all_mi(nfs4_server_t *sp)
3792 {
3793         mntinfo4_t *mi;
3794 
3795         ASSERT(MUTEX_HELD(&sp->s_lock));
3796 
3797         while (sp->mntinfo4_list != NULL) {
3798                 mi = sp->mntinfo4_list;
3799                 /*
3800                  * Grab a reference in case there is only one left (which
3801                  * remove_mi() frees).
3802                  */
3803                 VFS_HOLD(mi->mi_vfsp);
3804                 remove_mi(sp, mi);
3805                 VFS_RELE(mi->mi_vfsp);
3806         }
3807 }
3808 
3809 /*
3810  * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
3811  *
3812  * This version can be called with a null nfs4_server_t arg,
3813  * and will either find the right one and handle locking, or
3814  * do nothing because the mi wasn't added to an sp's mntinfo4_list.
3815  */
3816 void
3817 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
3818 {
3819         nfs4_server_t   *sp;
3820 
3821         if (esp) {
3822                 nfs4_remove_mi_from_server_nolock(mi, esp);
3823                 return;
3824         }
3825 
3826         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3827         if (sp = find_nfs4_server_all(mi, 1)) {
3828                 nfs4_remove_mi_from_server_nolock(mi, sp);
3829                 mutex_exit(&sp->s_lock);
3830                 nfs4_server_rele(sp);
3831         }
3832         nfs_rw_exit(&mi->mi_recovlock);
3833 }
3834 
3835 /*
3836  * Return TRUE if the given server has any non-unmounted filesystems.
3837  */
3838 
3839 bool_t
3840 nfs4_fs_active(nfs4_server_t *sp)
3841 {
3842         mntinfo4_t *mi;
3843 
3844         ASSERT(MUTEX_HELD(&sp->s_lock));
3845 
3846         for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
3847                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
3848                         return (TRUE);
3849         }
3850 
3851         return (FALSE);
3852 }
3853 
/*
 * Mark sp as finished and notify any waiters.
 *
 * Sets sp->s_thread_exit to NFS4_THREAD_EXIT and broadcasts on
 * sp->cv_thread_exit.  The caller must hold sp->s_lock.
 */

void
nfs4_mark_srv_dead(nfs4_server_t *sp)
{
        ASSERT(MUTEX_HELD(&sp->s_lock));

        /* set the flag first so that woken waiters observe it */
        sp->s_thread_exit = NFS4_THREAD_EXIT;
        cv_broadcast(&sp->cv_thread_exit);
}
3866 
3867 /*
3868  * Create a new nfs4_server_t structure.
3869  * Returns new node unlocked and not in list, but with a reference count of
3870  * 1.
3871  */
3872 struct nfs4_server *
3873 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3874 {
3875         struct nfs4_server *np;
3876         timespec_t tt;
3877         union {
3878                 struct {
3879                         uint32_t sec;
3880                         uint32_t subsec;
3881                 } un_curtime;
3882                 verifier4       un_verifier;
3883         } nfs4clientid_verifier;
3884         /*
3885          * We change this ID string carefully and with the Solaris
3886          * NFS server behaviour in mind.  "+referrals" indicates
3887          * a client that can handle an NFSv4 referral.
3888          */
3889         char id_val[] = "Solaris: %s, NFSv4 kernel client +referrals";
3890         int len;
3891 
3892         np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
3893         np->saddr.len = svp->sv_addr.len;
3894         np->saddr.maxlen = svp->sv_addr.maxlen;
3895         np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
3896         bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
3897         np->s_refcnt = 1;
3898 
3899         /*
3900          * Build the nfs_client_id4 for this server mount.  Ensure
3901          * the verifier is useful and that the identification is
3902          * somehow based on the server's address for the case of
3903          * multi-homed servers.
3904          */
3905         nfs4clientid_verifier.un_verifier = 0;
3906         gethrestime(&tt);
3907         nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
3908         nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
3909         np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
3910 
3911         /*
3912          * calculate the length of the opaque identifier.  Subtract 2
3913          * for the "%s" and add the traditional +1 for null
3914          * termination.
3915          */
3916         len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
3917         np->clidtosend.id_len = len + np->saddr.maxlen;
3918 
3919         np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
3920         (void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
3921         bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
3922 
3923         np->s_flags = 0;
3924         np->mntinfo4_list = NULL;
3925         /* save cred for issuing rfs4calls inside the renew thread */
3926         crhold(cr);
3927         np->s_cred = cr;
3928         cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
3929         mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
3930         nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
3931         list_create(&np->s_deleg_list, sizeof (rnode4_t),
3932             offsetof(rnode4_t, r_deleg_link));
3933         np->s_thread_exit = 0;
3934         np->state_ref_count = 0;
3935         np->lease_valid = NFS4_LEASE_NOT_STARTED;
3936         cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
3937         cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
3938         np->s_otw_call_count = 0;
3939         cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
3940         np->zoneid = getzoneid();
3941         np->zone_globals = nfs4_get_callback_globals();
3942         ASSERT(np->zone_globals != NULL);
3943         return (np);
3944 }
3945 
3946 /*
3947  * Create a new nfs4_server_t structure and add it to the list.
3948  * Returns new node locked; reference must eventually be freed.
3949  */
3950 static struct nfs4_server *
3951 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3952 {
3953         nfs4_server_t *sp;
3954 
3955         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3956         sp = new_nfs4_server(svp, cr);
3957         mutex_enter(&sp->s_lock);
3958         insque(sp, &nfs4_server_lst);
3959         sp->s_refcnt++;                      /* list gets a reference */
3960         sp->s_flags |= N4S_INSERTED;
3961         sp->clientid = 0;
3962         return (sp);
3963 }
3964 
/*
 * Debug switch for the nfs4_server_t list: gates the NFS4_DEBUG output in
 * dumpnfs4slist(), and on DEBUG builds makes nfs4_move_mi() call
 * dumpnfs4slist() when non-zero.
 */
int nfs4_server_t_debug = 0;

/* lint builds see only a prototype; the definition below is compiled out */
#ifdef lint
extern void
dumpnfs4slist(char *, mntinfo4_t *, clientid4, servinfo4_t *);
#endif
3971 
3972 #ifndef lint
3973 #ifdef DEBUG
3974 void
3975 dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
3976 {
3977         int hash16(void *p, int len);
3978         nfs4_server_t *np;
3979 
3980         NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
3981             "dumping nfs4_server_t list in %s", txt));
3982         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3983             "mi 0x%p, want clientid %llx, addr %d/%04X",
3984             mi, (longlong_t)clientid, srv_p->sv_addr.len,
3985             hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));
3986         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst;
3987             np = np->forw) {
3988                 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3989                     "node 0x%p,    clientid %llx, addr %d/%04X, cnt %d",
3990                     np, (longlong_t)np->clientid, np->saddr.len,
3991                     hash16((void *)np->saddr.buf, np->saddr.len),
3992                     np->state_ref_count));
3993                 if (np->saddr.len == srv_p->sv_addr.len &&
3994                     bcmp(np->saddr.buf, srv_p->sv_addr.buf,
3995                     np->saddr.len) == 0)
3996                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3997                             " - address matches"));
3998                 if (np->clientid == clientid || np->clientid == 0)
3999                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
4000                             " - clientid matches"));
4001                 if (np->s_thread_exit != NFS4_THREAD_EXIT)
4002                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
4003                             " - thread not exiting"));
4004         }
4005         delay(hz);
4006 }
4007 #endif
4008 #endif
4009 
4010 
4011 /*
4012  * Move a mntinfo4_t from one server list to another.
4013  * Locking of the two nfs4_server_t nodes will be done in list order.
4014  *
4015  * Returns NULL if the current nfs4_server_t for the filesystem could not
4016  * be found (e.g., due to forced unmount).  Otherwise returns a reference
4017  * to the new nfs4_server_t, which must eventually be freed.
4018  */
4019 nfs4_server_t *
4020 nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
4021 {
4022         nfs4_server_t *p, *op = NULL, *np = NULL;
4023         int num_open;
4024         zoneid_t zoneid = nfs_zoneid();
4025 
4026         ASSERT(nfs_zone() == mi->mi_zone);
4027 
4028         mutex_enter(&nfs4_server_lst_lock);
4029 #ifdef DEBUG
4030         if (nfs4_server_t_debug)
4031                 dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
4032 #endif
4033         for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
4034                 if (p->zoneid != zoneid)
4035                         continue;
4036                 if (p->saddr.len == old->sv_addr.len &&
4037                     bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
4038                     p->s_thread_exit != NFS4_THREAD_EXIT) {
4039                         op = p;
4040                         mutex_enter(&op->s_lock);
4041                         op->s_refcnt++;
4042                 }
4043                 if (p->saddr.len == new->sv_addr.len &&
4044                     bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
4045                     p->s_thread_exit != NFS4_THREAD_EXIT) {
4046                         np = p;
4047                         mutex_enter(&np->s_lock);
4048                 }
4049                 if (op != NULL && np != NULL)
4050                         break;
4051         }
4052         if (op == NULL) {
4053                 /*
4054                  * Filesystem has been forcibly unmounted.  Bail out.
4055                  */
4056                 if (np != NULL)
4057                         mutex_exit(&np->s_lock);
4058                 mutex_exit(&nfs4_server_lst_lock);
4059                 return (NULL);
4060         }
4061         if (np != NULL) {
4062                 np->s_refcnt++;
4063         } else {
4064 #ifdef DEBUG
4065                 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4066                     "nfs4_move_mi: no target nfs4_server, will create."));
4067 #endif
4068                 np = add_new_nfs4_server(new, kcred);
4069         }
4070         mutex_exit(&nfs4_server_lst_lock);
4071 
4072         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4073             "nfs4_move_mi: for mi 0x%p, "
4074             "old servinfo4 0x%p, new servinfo4 0x%p, "
4075             "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
4076             (void*)mi, (void*)old, (void*)new,
4077             (void*)op, (void*)np));
4078         ASSERT(op != NULL && np != NULL);
4079 
4080         /* discard any delegations */
4081         nfs4_deleg_discard(mi, op);
4082 
4083         num_open = mi->mi_open_files;
4084         mi->mi_open_files = 0;
4085         op->state_ref_count -= num_open;
4086         ASSERT(op->state_ref_count >= 0);
4087         np->state_ref_count += num_open;
4088         nfs4_remove_mi_from_server_nolock(mi, op);
4089         mi->mi_open_files = num_open;
4090         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4091             "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
4092             mi->mi_open_files, op->state_ref_count, np->state_ref_count));
4093 
4094         nfs4_add_mi_to_server(np, mi);
4095 
4096         mutex_exit(&op->s_lock);
4097         mutex_exit(&np->s_lock);
4098         nfs4_server_rele(op);
4099 
4100         return (np);
4101 }
4102 
4103 /*
4104  * Need to have the nfs4_server_lst_lock.
4105  * Search the nfs4_server list to find a match on this servinfo4
4106  * based on its address.
4107  *
4108  * Returns NULL if no match is found.  Otherwise returns a reference (which
4109  * must eventually be freed) to a locked nfs4_server.
4110  */
4111 nfs4_server_t *
4112 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
4113 {
4114         nfs4_server_t *np;
4115         zoneid_t zoneid = nfs_zoneid();
4116 
4117         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
4118         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4119                 if (np->zoneid == zoneid &&
4120                     np->saddr.len == srv_p->sv_addr.len &&
4121                     bcmp(np->saddr.buf, srv_p->sv_addr.buf,
4122                     np->saddr.len) == 0 &&
4123                     np->s_thread_exit != NFS4_THREAD_EXIT) {
4124                         mutex_enter(&np->s_lock);
4125                         np->s_refcnt++;
4126                         return (np);
4127                 }
4128         }
4129         return (NULL);
4130 }
4131 
4132 /*
4133  * Locks the nfs4_server down if it is found and returns a reference that
4134  * must eventually be freed.
4135  */
4136 static nfs4_server_t *
4137 lookup_nfs4_server(nfs4_server_t *sp, int any_state)
4138 {
4139         nfs4_server_t *np;
4140 
4141         mutex_enter(&nfs4_server_lst_lock);
4142         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4143                 mutex_enter(&np->s_lock);
4144                 if (np == sp && np->s_refcnt > 0 &&
4145                     (np->s_thread_exit != NFS4_THREAD_EXIT || any_state)) {
4146                         mutex_exit(&nfs4_server_lst_lock);
4147                         np->s_refcnt++;
4148                         return (np);
4149                 }
4150                 mutex_exit(&np->s_lock);
4151         }
4152         mutex_exit(&nfs4_server_lst_lock);
4153 
4154         return (NULL);
4155 }
4156 
4157 /*
4158  * The caller should be holding mi->mi_recovlock, and it should continue to
4159  * hold the lock until done with the returned nfs4_server_t.  Once
4160  * mi->mi_recovlock is released, there is no guarantee that the returned
4161  * mi->nfs4_server_t will continue to correspond to mi.
4162  */
4163 nfs4_server_t *
4164 find_nfs4_server(mntinfo4_t *mi)
4165 {
4166         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4167             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4168 
4169         return (lookup_nfs4_server(mi->mi_srv, 0));
4170 }
4171 
4172 /*
4173  * Same as above, but takes an "any_state" parameter which can be
4174  * set to 1 if the caller wishes to find nfs4_server_t's which
4175  * have been marked for termination by the exit of the renew
4176  * thread.  This should only be used by operations which are
4177  * cleaning up and will not cause an OTW op.
4178  */
4179 nfs4_server_t *
4180 find_nfs4_server_all(mntinfo4_t *mi, int any_state)
4181 {
4182         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4183             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4184 
4185         return (lookup_nfs4_server(mi->mi_srv, any_state));
4186 }
4187 
4188 /*
4189  * Lock sp, but only if it's still active (in the list and hasn't been
4190  * flagged as exiting) or 'any_state' is non-zero.
4191  * Returns TRUE if sp got locked and adds a reference to sp.
4192  */
4193 bool_t
4194 nfs4_server_vlock(nfs4_server_t *sp, int any_state)
4195 {
4196         return (lookup_nfs4_server(sp, any_state) != NULL);
4197 }
4198 
4199 /*
4200  * Release the reference to sp and destroy it if that's the last one.
4201  */
4202 
4203 void
4204 nfs4_server_rele(nfs4_server_t *sp)
4205 {
4206         mutex_enter(&sp->s_lock);
4207         ASSERT(sp->s_refcnt > 0);
4208         sp->s_refcnt--;
4209         if (sp->s_refcnt > 0) {
4210                 mutex_exit(&sp->s_lock);
4211                 return;
4212         }
4213         mutex_exit(&sp->s_lock);
4214 
4215         mutex_enter(&nfs4_server_lst_lock);
4216         mutex_enter(&sp->s_lock);
4217         if (sp->s_refcnt > 0) {
4218                 mutex_exit(&sp->s_lock);
4219                 mutex_exit(&nfs4_server_lst_lock);
4220                 return;
4221         }
4222         remque(sp);
4223         sp->forw = sp->back = NULL;
4224         mutex_exit(&nfs4_server_lst_lock);
4225         destroy_nfs4_server(sp);
4226 }
4227 
4228 static void
4229 destroy_nfs4_server(nfs4_server_t *sp)
4230 {
4231         ASSERT(MUTEX_HELD(&sp->s_lock));
4232         ASSERT(sp->s_refcnt == 0);
4233         ASSERT(sp->s_otw_call_count == 0);
4234 
4235         remove_all_mi(sp);
4236 
4237         crfree(sp->s_cred);
4238         kmem_free(sp->saddr.buf, sp->saddr.maxlen);
4239         kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
4240         mutex_exit(&sp->s_lock);
4241 
4242         /* destroy the nfs4_server */
4243         nfs4callback_destroy(sp);
4244         list_destroy(&sp->s_deleg_list);
4245         mutex_destroy(&sp->s_lock);
4246         cv_destroy(&sp->cv_thread_exit);
4247         cv_destroy(&sp->s_cv_otw_count);
4248         cv_destroy(&sp->s_clientid_pend);
4249         cv_destroy(&sp->wait_cb_null);
4250         nfs_rw_destroy(&sp->s_recovlock);
4251         kmem_free(sp, sizeof (*sp));
4252 }
4253 
4254 /*
4255  * Fork off a thread to free the data structures for a mount.
4256  */
4257 
4258 static void
4259 async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4260 {
4261         freemountargs_t *args;
4262         args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
4263         args->fm_vfsp = vfsp;
4264         VFS_HOLD(vfsp);
4265         MI4_HOLD(VFTOMI4(vfsp));
4266         args->fm_flag = flag;
4267         args->fm_cr = cr;
4268         crhold(cr);
4269         (void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
4270             minclsyspri);
4271 }
4272 
4273 static void
4274 nfs4_free_mount_thread(freemountargs_t *args)
4275 {
4276         mntinfo4_t *mi;
4277         nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
4278         mi = VFTOMI4(args->fm_vfsp);
4279         crfree(args->fm_cr);
4280         VFS_RELE(args->fm_vfsp);
4281         MI4_RELE(mi);
4282         kmem_free(args, sizeof (freemountargs_t));
4283         zthread_exit();
4284         /* NOTREACHED */
4285 }
4286 
4287 /*
4288  * Thread to free the data structures for a given filesystem.
4289  */
4290 static void
4291 nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4292 {
4293         mntinfo4_t              *mi = VFTOMI4(vfsp);
4294         nfs4_server_t           *sp;
4295         callb_cpr_t             cpr_info;
4296         kmutex_t                cpr_lock;
4297         boolean_t               async_thread;
4298         int                     removed;
4299 
4300         bool_t                  must_unlock;
4301         nfs4_ephemeral_tree_t   *eph_tree;
4302 
4303         /*
4304          * We need to participate in the CPR framework if this is a kernel
4305          * thread.
4306          */
4307         async_thread = (curproc == nfs_zone()->zone_zsched);
4308         if (async_thread) {
4309                 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
4310                 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
4311                     "nfsv4AsyncUnmount");
4312         }
4313 
4314         /*
4315          * We need to wait for all outstanding OTW calls
4316          * and recovery to finish before we remove the mi
4317          * from the nfs4_server_t, as current pending
4318          * calls might still need this linkage (in order
4319          * to find a nfs4_server_t from a mntinfo4_t).
4320          */
4321         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
4322         sp = find_nfs4_server(mi);
4323         nfs_rw_exit(&mi->mi_recovlock);
4324 
4325         if (sp) {
4326                 while (sp->s_otw_call_count != 0) {
4327                         if (async_thread) {
4328                                 mutex_enter(&cpr_lock);
4329                                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
4330                                 mutex_exit(&cpr_lock);
4331                         }
4332                         cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
4333                         if (async_thread) {
4334                                 mutex_enter(&cpr_lock);
4335                                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4336                                 mutex_exit(&cpr_lock);
4337                         }
4338                 }
4339                 mutex_exit(&sp->s_lock);
4340                 nfs4_server_rele(sp);
4341                 sp = NULL;
4342         }
4343 
4344         mutex_enter(&mi->mi_lock);
4345         while (mi->mi_in_recovery != 0) {
4346                 if (async_thread) {
4347                         mutex_enter(&cpr_lock);
4348                         CALLB_CPR_SAFE_BEGIN(&cpr_info);
4349                         mutex_exit(&cpr_lock);
4350                 }
4351                 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
4352                 if (async_thread) {
4353                         mutex_enter(&cpr_lock);
4354                         CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4355                         mutex_exit(&cpr_lock);
4356                 }
4357         }
4358         mutex_exit(&mi->mi_lock);
4359 
4360         /*
4361          * If we got an error, then do not nuke the
4362          * tree. Either the harvester is busy reclaiming
4363          * this node or we ran into some busy condition.
4364          *
4365          * The harvester will eventually come along and cleanup.
4366          * The only problem would be the root mount point.
4367          *
4368          * Since the busy node can occur for a variety
4369          * of reasons and can result in an entry staying
4370          * in df output but no longer accessible from the
4371          * directory tree, we are okay.
4372          */
4373         if (!nfs4_ephemeral_umount(mi, flag, cr,
4374             &must_unlock, &eph_tree))
4375                 nfs4_ephemeral_umount_activate(mi, &must_unlock,
4376                     &eph_tree);
4377 
4378         /*
4379          * The original purge of the dnlc via 'dounmount'
4380          * doesn't guarantee that another dnlc entry was not
4381          * added while we waitied for all outstanding OTW
4382          * and recovery calls to finish.  So re-purge the
4383          * dnlc now.
4384          */
4385         (void) dnlc_purge_vfsp(vfsp, 0);
4386 
4387         /*
4388          * We need to explicitly stop the manager thread; the asyc worker
4389          * threads can timeout and exit on their own.
4390          */
4391         mutex_enter(&mi->mi_async_lock);
4392         mi->mi_max_threads = 0;
4393         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
4394         mutex_exit(&mi->mi_async_lock);
4395         if (mi->mi_manager_thread)
4396                 nfs4_async_manager_stop(vfsp);
4397 
4398         destroy_rtable4(vfsp, cr);
4399 
4400         nfs4_remove_mi_from_server(mi, NULL);
4401 
4402         if (async_thread) {
4403                 mutex_enter(&cpr_lock);
4404                 CALLB_CPR_EXIT(&cpr_info);  /* drops cpr_lock */
4405                 mutex_destroy(&cpr_lock);
4406         }
4407 
4408         removed = nfs4_mi_zonelist_remove(mi);
4409         if (removed)
4410                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
4411 }
4412 
4413 /* Referral related sub-routines */
4414 
4415 /* Freeup knetconfig */
4416 static void
4417 free_knconf_contents(struct knetconfig *k)
4418 {
4419         if (k == NULL)
4420                 return;
4421         if (k->knc_protofmly)
4422                 kmem_free(k->knc_protofmly, KNC_STRSIZE);
4423         if (k->knc_proto)
4424                 kmem_free(k->knc_proto, KNC_STRSIZE);
4425 }
4426 
4427 /*
4428  * This updates newpath variable with exact name component from the
4429  * path which gave us a NFS4ERR_MOVED error.
4430  * If the path is /rp/aaa/bbb and nth value is 1, aaa is returned.
4431  */
4432 static char *
4433 extract_referral_point(const char *svp, int nth)
4434 {
4435         int num_slashes = 0;
4436         const char *p;
4437         char *newpath = NULL;
4438         int i = 0;
4439 
4440         newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4441         for (p = svp; *p; p++) {
4442                 if (*p == '/')
4443                         num_slashes++;
4444                 if (num_slashes == nth + 1) {
4445                         p++;
4446                         while (*p != '/') {
4447                                 if (*p == '\0')
4448                                         break;
4449                                 newpath[i] = *p;
4450                                 i++;
4451                                 p++;
4452                         }
4453                         newpath[i++] = '\0';
4454                         break;
4455                 }
4456         }
4457         return (newpath);
4458 }
4459 
4460 /*
4461  * This sets up a new path in sv_path to do a lookup of the referral point.
4462  * If the path is /rp/aaa/bbb and the referral point is aaa,
4463  * this updates /rp/aaa. This path will be used to get referral
4464  * location.
4465  */
4466 static void
4467 setup_newsvpath(servinfo4_t *svp, int nth)
4468 {
4469         int num_slashes = 0, pathlen, i = 0;
4470         char *newpath, *p;
4471 
4472         newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4473         for (p = svp->sv_path; *p; p++) {
4474                 newpath[i] =  *p;
4475                 if (*p == '/')
4476                         num_slashes++;
4477                 if (num_slashes == nth + 1) {
4478                         newpath[i] = '\0';
4479                         pathlen = strlen(newpath) + 1;
4480                         kmem_free(svp->sv_path, svp->sv_pathlen);
4481                         svp->sv_path = kmem_alloc(pathlen, KM_SLEEP);
4482                         svp->sv_pathlen = pathlen;
4483                         bcopy(newpath, svp->sv_path, pathlen);
4484                         break;
4485                 }
4486                 i++;
4487         }
4488         kmem_free(newpath, MAXPATHLEN);
4489 }