/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
 * All Rights Reserved
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */


/*
 * Implements a kernel based, client side RPC over Connection Oriented
 * Transports (COTS).
 */

/*
 * Much of this file has been re-written to let NFS work better over slow
 * transports. A description follows.
 *
 * One of the annoying things about kRPC/COTS is that it will temporarily
 * create more than one connection between a client and server. This
 * happens because when a connection is made, the end-point's entry in the
 * linked list of connections (headed by cm_hd) is removed so that other
 * threads don't mess with it. We bit the bullet and now keep the
 * end-point on the connection list, having introduced state bits,
 * condition variables, etc. to the connection entry data structure
 * (struct cm_xprt).
 *
 * Here is a summary of the changes to cm_xprt:
 *
 *	x_ctime is the timestamp of when the endpoint was last
 *	connected or disconnected. If an end-point is ever disconnected
 *	or re-connected, then any outstanding RPC request is presumed
 *	lost, telling clnt_cots_kcallit that it needs to re-send the
 *	request, not just wait for the original request's reply to
 *	arrive.
 *
 *	x_thread flag which tells us if a thread is doing a connection attempt.
 *
 *	x_waitdis flag which tells us we are waiting for a disconnect ACK.
 *
 *	x_needdis flag which tells us we need to send a T_DISCON_REQ
 *	to kill the connection.
 *
 *	x_needrel flag which tells us we need to send a T_ORDREL_REQ to
 *	gracefully close the connection.
 *
 *	#defined bitmasks for all the b_* bits so that more
 *	efficient (and at times less clumsy) masks can be used to
 *	manipulate state in cases where multiple bits have to be
 *	set/cleared/checked in the same critical section.
 *
 *	x_conn_cv and x_dis_cv are new condition variables to let
 *	threads know when the connection attempt is done, and to let
 *	the connecting thread know when the disconnect handshake is
 *	done.
 *
 * Added the CONN_HOLD() macro so that all reference holds have the same
 * look and feel.
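 *
 * A minimal sketch (illustrative only) of the hold/release pairing the
 * macro standardizes; CONN_HOLD() and connmgr_release() are both defined
 * later in this file:
 *
 *	CONN_HOLD(cm_entry);		take a reference under x_lock
 *	... use the transport ...
 *	connmgr_release(cm_entry);	drop the reference when done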
 *
 * In the private (cku_private) portion of the client handle,
 *
 *	cku_flags replaces the boolean cku_sent. cku_flags keeps
 *	track of whether a request has been sent, and whether the
 *	client handle's call record is on the dispatch list (so that
 *	the reply can be matched by XID to the right client handle).
 *	The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
 *	and still have the response find the right client handle so
 *	that the retry of CLNT_CALL() gets the result. Testing found
 *	situations where, if the timeout was increased, performance
 *	degraded. This was due to us hitting a window where the thread
 *	was back in rfscall() (probably printing "server not responding")
 *	while the response came back but there was no place to put it.
 *
 *	cku_ctime is just a cache of x_ctime. If they match,
 *	clnt_cots_kcallit() won't send a retry (unless the maximum
 *	receive count limit has been reached). If they don't match, then
 *	we assume the request has been lost, and a retry of the request
 *	is needed.
 *
 *	cku_recv_attempts counts the number of receive attempts made
 *	after one try is sent on the wire.
 *
 * Added the clnt_delay() routine so that interruptible and
 * noninterruptible delays are possible.
 *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really do bash
 * a server that may be booting and has not yet started nfsd.
 *
 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable).
 * Why don't we just wait forever (receive an infinite # of times)?
 * Because the server may have rebooted. More insidious is that some
 * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
 * but it is a reality.
 *
 * The case of a server doing an orderly release really messes up the
 * client's recovery, especially if the server's TCP implementation is
 * buggy. It was found that the kRPC/COTS client was breaking some
 * TPI rules, such as not waiting for the acknowledgement of a
 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
 * T_DISCON_REQ in clnt_dispatch_notifyall()).
 *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish. If a server sends a
 * T_ORDREL_IND, there is little point in an RPC client doing a
 * T_ORDREL_REQ, because the RPC request isn't going to make it (the
 * server is saying that it won't accept any more data). So kRPC was
 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
 * connection skips the TIMEWAIT state and goes straight to a bound state
 * that kRPC can quickly switch to connected.
 *
 * Code that issues TPI requests must use waitforack() to wait for the
 * corresponding ack (assuming there is one) in any future modifications.
 * This works around problems that may be introduced by breaking TPI rules
 * (by submitting new calls before earlier requests have been acked) in the
 * case of a signal or other early return. waitforack() depends on
 * clnt_dispatch_notifyconn() to issue the wakeup when the ack
 * arrives, so adding new TPI calls may require corresponding changes
 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
 * not to set it too low or TPI ACKs will be lost.
 */
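
/*
 * A condensed sketch (illustrative only) of the retransmit decision
 * described above, as clnt_cots_kcallit() makes it below: re-send
 * whenever nothing has been sent yet or the connection has been reset
 * (cku_ctime != x_ctime); otherwise just keep trying to receive, up to
 * clnt_cots_maxrecv attempts.
 *
 *	if ((p->cku_flags & CKU_SENT) == 0 ||
 *	    p->cku_ctime != cm_entry->x_ctime) {
 *		p->cku_ctime = cm_entry->x_ctime;	(re-send request)
 *	} else if (p->cku_recv_attempts < clnt_cots_maxrecv) {
 *		p->cku_recv_attempts++;			(receive only)
 *	}
 */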

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/timod.h>
#include <sys/tiuser.h>
#include <sys/tihdr.h>
#include <sys/t_kuser.h>
#include <sys/fcntl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/sdt.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/rpc_msg.h>

#define	COTS_DEFAULT_ALLOCSIZE	2048

#define	WIRE_HDR_SIZE	20	/* serialized call header, sans proc number */
#define	MSG_OFFSET	128	/* offset of call into the mblk */

const char *kinet_ntop6(uchar_t *, char *, size_t);

static int	clnt_cots_ksettimers(CLIENT *, struct rpc_timers *,
    struct rpc_timers *, int, void (*)(int, int, caddr_t), caddr_t, uint32_t);
static enum clnt_stat	clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
    caddr_t, xdrproc_t, caddr_t, struct timeval);
static void	clnt_cots_kabort(CLIENT *);
static void	clnt_cots_kerror(CLIENT *, struct rpc_err *);
static bool_t	clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t);
static void	clnt_cots_kdestroy(CLIENT *);
static bool_t	clnt_cots_kcontrol(CLIENT *, int, char *);


/* List of transports managed by the connection manager. */
struct cm_xprt {
	TIUSER		*x_tiptr;	/* transport handle */
	queue_t		*x_wq;		/* send queue */
	clock_t		x_time;		/* last time we handed this xprt out */
	clock_t		x_ctime;	/* time we went to CONNECTED */
	int		x_tidu_size;	/* TIDU size of this transport */
	union {
		struct {
			unsigned int
#ifdef _BIT_FIELDS_HTOL
			b_closing: 1,	/* we've sent an ord rel on this conn */
			b_dead: 1,	/* transport is closed or disconn */
			b_doomed: 1,	/* too many conns, let this go idle */
			b_connected: 1,	/* this connection is connected */

			b_ordrel: 1,	/* do an orderly release? */
			b_thread: 1,	/* thread doing connect */
			b_waitdis: 1,	/* waiting for disconnect ACK */
			b_needdis: 1,	/* need T_DISCON_REQ */

			b_needrel: 1,	/* need T_ORDREL_REQ */
			b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */
					/* disconnect during connect */

			b_pad: 22;

#endif

#ifdef _BIT_FIELDS_LTOH
			b_pad: 22,

			b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */
					/* disconnect during connect */
			b_needrel: 1,	/* need T_ORDREL_REQ */

			b_needdis: 1,	/* need T_DISCON_REQ */
			b_waitdis: 1,	/* waiting for disconnect ACK */
			b_thread: 1,	/* thread doing connect */
			b_ordrel: 1,	/* do an orderly release? */

			b_connected: 1,	/* this connection is connected */
			b_doomed: 1,	/* too many conns, let this go idle */
			b_dead: 1,	/* transport is closed or disconn */
			b_closing: 1;	/* we've sent an ord rel on this conn */
#endif
		} bit; unsigned int word;

#define	x_closing	x_state.bit.b_closing
#define	x_dead		x_state.bit.b_dead
#define	x_doomed	x_state.bit.b_doomed
#define	x_connected	x_state.bit.b_connected

#define	x_ordrel	x_state.bit.b_ordrel
#define	x_thread	x_state.bit.b_thread
#define	x_waitdis	x_state.bit.b_waitdis
#define	x_needdis	x_state.bit.b_needdis

#define	x_needrel	x_state.bit.b_needrel
#define	x_early_disc	x_state.bit.b_early_disc

#define	x_state_flags	x_state.word

#define	X_CLOSING	0x80000000
#define	X_DEAD		0x40000000
#define	X_DOOMED	0x20000000
#define	X_CONNECTED	0x10000000

#define	X_ORDREL	0x08000000
#define	X_THREAD	0x04000000
#define	X_WAITDIS	0x02000000
#define	X_NEEDDIS	0x01000000

#define	X_NEEDREL	0x00800000
#define	X_EARLYDISC	0x00400000

#define	X_BADSTATES	(X_CLOSING | X_DEAD | X_DOOMED)

	} x_state;
	int		x_ref;		/* number of users of this xprt */
	int		x_family;	/* address family of transport */
	dev_t		x_rdev;		/* device number of transport */
	struct cm_xprt	*x_next;

	struct netbuf	x_server;	/* destination address */
	struct netbuf	x_src;		/* src address (for retries) */
	kmutex_t	x_lock;		/* lock on this entry */
	kcondvar_t	x_cv;		/* to signal when can be closed */
	kcondvar_t	x_conn_cv;	/* to signal when connection attempt */
					/* is complete */
	kstat_t		*x_ksp;

	kcondvar_t	x_dis_cv;	/* to signal when disconnect attempt */
					/* is complete */
	zoneid_t	x_zoneid;	/* zone this xprt belongs to */
};

typedef struct cm_kstat_xprt {
	kstat_named_t	x_wq;
	kstat_named_t	x_server;
	kstat_named_t	x_family;
	kstat_named_t	x_rdev;
	kstat_named_t	x_time;
	kstat_named_t	x_state;
	kstat_named_t	x_ref;
	kstat_named_t	x_port;
} cm_kstat_xprt_t;

static cm_kstat_xprt_t cm_kstat_template = {
	{ "write_queue", KSTAT_DATA_UINT32 },
	{ "server", KSTAT_DATA_STRING },
	{ "addr_family", KSTAT_DATA_UINT32 },
	{ "device", KSTAT_DATA_UINT32 },
	{ "time_stamp", KSTAT_DATA_UINT32 },
	{ "status", KSTAT_DATA_UINT32 },
	{ "ref_count", KSTAT_DATA_INT32 },
	{ "port", KSTAT_DATA_UINT32 },
};

/*
 * The inverse of this is connmgr_release().
 */
#define	CONN_HOLD(Cm_entry)	{\
	mutex_enter(&(Cm_entry)->x_lock);	\
	(Cm_entry)->x_ref++;	\
	mutex_exit(&(Cm_entry)->x_lock);	\
}

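/*
 * A rough sketch (illustrative only) of the cku_flags lifecycle that
 * clnt_cots_kcallit() manages for the private handle defined below:
 *
 *	new call (cku_xid == 0):  clear CKU_SENT; call_table_remove()
 *				  and clear CKU_ONQUEUE if it was set
 *	request dispatched:	  cku_flags = (CKU_ONQUEUE | CKU_SENT)
 *	reply consumed or
 *	non-timeout error:	  call_table_remove(); clear CKU_ONQUEUE
 */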

/*
 * Private data per rpc handle.  This structure is allocated by
 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy.
 */
typedef struct cku_private_s {
	CLIENT			cku_client;	/* client handle */
	calllist_t		cku_call;	/* for dispatching calls */
	struct rpc_err		cku_err;	/* error status */

	struct netbuf		cku_srcaddr;	/* source address for retries */
	int			cku_addrfmly;	/* for binding port */
	struct netbuf		cku_addr;	/* remote address */
	dev_t			cku_device;	/* device to use */
	uint_t			cku_flags;
#define	CKU_ONQUEUE		0x1
#define	CKU_SENT		0x2

	bool_t			cku_progress;	/* for CLSET_PROGRESS */
	uint32_t		cku_xid;	/* current XID */
	clock_t			cku_ctime;	/* time stamp of when */
						/* connection was created */
	uint_t			cku_recv_attempts;
	XDR			cku_outxdr;	/* xdr routine for output */
	XDR			cku_inxdr;	/* xdr routine for input */
	char			cku_rpchdr[WIRE_HDR_SIZE + 4];
						/* pre-serialized rpc header */

	uint_t			cku_outbuflen;	/* default output mblk length */
	struct cred		*cku_cred;	/* credentials */
	bool_t			cku_nodelayonerr;
						/* for CLSET_NODELAYONERR */
	int			cku_useresvport; /* Use reserved port */
	struct rpc_cots_client	*cku_stats;	/* stats for zone */
} cku_private_t;

static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *,
    const struct timeval *, struct netbuf *, int, struct netbuf *,
    struct rpc_err *, bool_t, bool_t, cred_t *);

static bool_t	connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
    int, calllist_t *, int *, bool_t reconnect,
    const struct timeval *, bool_t, cred_t *);

static void	*connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
    t_uscalar_t length, uint_t align_size);
static bool_t	connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr);
static bool_t	connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
    calllist_t *e, cred_t *cr);
static bool_t	connmgr_setopt_int(queue_t *wq, int level, int name, int val,
    calllist_t *e, cred_t *cr);
static bool_t	connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
static void	connmgr_sndrel(struct cm_xprt *);
static void	connmgr_snddis(struct cm_xprt *);
static void	connmgr_close(struct cm_xprt *);
static void	connmgr_release(struct cm_xprt *);
static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *,
    cku_private_t *);

static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *,
    struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t,
    bool_t, int, cred_t *);

static void connmgr_cancelconn(struct cm_xprt *);
static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *,
    bool_t);
static void connmgr_dis_and_wait(struct cm_xprt *);

static int	clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t,
    uint_t);

static int clnt_delay(clock_t, bool_t);

static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t);

/*
 * Operations vector for TCP/IP based RPC
 */
static struct clnt_ops tcp_ops = {
	clnt_cots_kcallit,	/* do rpc call */
	clnt_cots_kabort,	/* abort call */
	clnt_cots_kerror,	/* return error status */
	clnt_cots_kfreeres,	/* free results */
	clnt_cots_kdestroy,	/* destroy rpc handle */
	clnt_cots_kcontrol,	/* the ioctl() of rpc */
	clnt_cots_ksettimers,	/* set retry timers */
};

static int rpc_kstat_instance = 0;	/* keeps the current instance */
					/* number for the next kstat_create */

static struct cm_xprt *cm_hd = NULL;
static kmutex_t connmgr_lock;	/* for connection mngr's list of transports */
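
/*
 * Locking overview (summarized from the code below, illustrative only):
 * connmgr_lock protects the cm_hd list itself and the x_state bits of its
 * entries, while each entry's x_ref count is protected by that entry's
 * own x_lock (see CONN_HOLD() above and connmgr_release()).
 */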

extern kmutex_t clnt_max_msg_lock;

static calllist_t *clnt_pending = NULL;
extern kmutex_t clnt_pending_lock;

static int clnt_cots_hash_size = DEFAULT_HASH_SIZE;

static call_table_t *cots_call_ht;

static const struct rpc_cots_client {
	kstat_named_t	rccalls;
	kstat_named_t	rcbadcalls;
	kstat_named_t	rcbadxids;
	kstat_named_t	rctimeouts;
	kstat_named_t	rcnewcreds;
	kstat_named_t	rcbadverfs;
	kstat_named_t	rctimers;
	kstat_named_t	rccantconn;
	kstat_named_t	rcnomem;
	kstat_named_t	rcintrs;
} cots_rcstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "badxids",	KSTAT_DATA_UINT64 },
	{ "timeouts",	KSTAT_DATA_UINT64 },
	{ "newcreds",	KSTAT_DATA_UINT64 },
	{ "badverfs",	KSTAT_DATA_UINT64 },
	{ "timers",	KSTAT_DATA_UINT64 },
	{ "cantconn",	KSTAT_DATA_UINT64 },
	{ "nomem",	KSTAT_DATA_UINT64 },
	{ "interrupts", KSTAT_DATA_UINT64 }
};

#define	COTSRCSTAT_INCR(p, x)	\
	atomic_inc_64(&(p)->x.value.ui64)

#define	CLNT_MAX_CONNS	1	/* concurrent connections between clnt/srvr */
int clnt_max_conns = CLNT_MAX_CONNS;

#define	CLNT_MIN_TIMEOUT	10	/* seconds to wait after we get a */
					/* connection reset */
#define	CLNT_MIN_CONNTIMEOUT	5	/* seconds to wait for a connection */


int clnt_cots_min_tout = CLNT_MIN_TIMEOUT;
int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT;

/*
 * Limit the number of times we will attempt to receive a reply without
 * re-sending a request.
 */
#define	CLNT_MAXRECV_WITHOUT_RETRY	3
uint_t clnt_cots_maxrecv = CLNT_MAXRECV_WITHOUT_RETRY;

uint_t *clnt_max_msg_sizep;
void (*clnt_stop_idle)(queue_t *wq);

#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((cku_private_t *)((h)->cl_private))

/*
 * Times to retry
 */
#define	REFRESHES	2	/* authentication refreshes */

/*
 * The following is used to determine the global default behavior for
 * COTS when binding to a local port.
 *
 * If the value is set to 1 the default will be to select a reserved
 * (aka privileged) port; if the value is zero the default will be to
 * use non-reserved ports.  Users of kRPC may override this by using
 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 */
int clnt_cots_do_bindresvport = 1;

static zone_key_t zone_cots_key;

/*
 * Default TCP send and receive buffer sizes for RPC connections.
 * These values can be tuned by /etc/system.
 */
int rpc_send_bufsz = 1024*1024;
int rpc_recv_bufsz = 1024*1024;
/*
 * To use the system-wide default for TCP send and receive buffer sizes,
 * use /etc/system to set rpc_default_tcp_bufsz to 1:
 *
 *	set rpcmod:rpc_default_tcp_bufsz=1
 */
int rpc_default_tcp_bufsz = 0;
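
/*
 * Likewise, the fixed sizes above can be tuned from /etc/system
 * (illustrative values; any sizes could be chosen):
 *
 *	set rpcmod:rpc_send_bufsz=0x200000
 *	set rpcmod:rpc_recv_bufsz=0x200000
 */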

/*
 * We need to do this after all kernel threads in the zone have exited.
 */
/* ARGSUSED */
static void
clnt_zone_destroy(zoneid_t zoneid, void *unused)
{
	struct cm_xprt **cmp;
	struct cm_xprt *cm_entry;
	struct cm_xprt *freelist = NULL;

	mutex_enter(&connmgr_lock);
	cmp = &cm_hd;
	while ((cm_entry = *cmp) != NULL) {
		if (cm_entry->x_zoneid == zoneid) {
			*cmp = cm_entry->x_next;
			cm_entry->x_next = freelist;
			freelist = cm_entry;
		} else {
			cmp = &cm_entry->x_next;
		}
	}
	mutex_exit(&connmgr_lock);
	while ((cm_entry = freelist) != NULL) {
		freelist = cm_entry->x_next;
		connmgr_close(cm_entry);
	}
}

int
clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog,
    rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl)
{
	CLIENT *h;
	cku_private_t *p;
	struct rpc_msg call_msg;
	struct rpcstat *rpcstat;

	RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog);

	rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
	ASSERT(rpcstat != NULL);

	/* Allocate and initialize the client handle. */
	p = kmem_zalloc(sizeof (*p), KM_SLEEP);

	h = ptoh(p);

	h->cl_private = (caddr_t)p;
	h->cl_auth = authkern_create();
	h->cl_ops = &tcp_ops;

	cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * If the current sanity check size in rpcmod is smaller
	 * than the size needed, then increase the sanity check.
	 */
	if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
	    max_msgsize > *clnt_max_msg_sizep) {
		mutex_enter(&clnt_max_msg_lock);
		if (max_msgsize > *clnt_max_msg_sizep)
			*clnt_max_msg_sizep = max_msgsize;
		mutex_exit(&clnt_max_msg_lock);
	}

	p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE;

	/* Preserialize the call message header */

	call_msg.rm_xid = 0;
	call_msg.rm_direction = CALL;
	call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
	call_msg.rm_call.cb_prog = prog;
	call_msg.rm_call.cb_vers = vers;

	xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE);

	if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
		XDR_DESTROY(&p->cku_outxdr);
		RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization "
		    "error\n");
		auth_destroy(h->cl_auth);
		kmem_free(p, sizeof (cku_private_t));
		RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n");
		return (EINVAL);		/* XXX */
	}
	XDR_DESTROY(&p->cku_outxdr);

	/*
	 * The zalloc initialized the fields below.
	 * p->cku_xid = 0;
	 * p->cku_flags = 0;
	 * p->cku_srcaddr.len = 0;
	 * p->cku_srcaddr.maxlen = 0;
	 */

	p->cku_cred = cred;
	p->cku_device = dev;
	p->cku_addrfmly = family;
	p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
	p->cku_addr.maxlen = addr->maxlen;
	p->cku_addr.len = addr->len;
	bcopy(addr->buf, p->cku_addr.buf, addr->len);
	p->cku_stats = rpcstat->rpc_cots_client;
	p->cku_useresvport = -1;	/* value has not been set */

	*ncl = h;
	return (0);
}

/*ARGSUSED*/
static void
clnt_cots_kabort(CLIENT *h)
{
}
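
/*
 * A minimal usage sketch of clnt_cots_kcreate() (hypothetical caller,
 * illustrative only; NFS_PROGRAM/NFS_VERSION stand in for any prog/vers):
 *
 *	CLIENT *client;
 *	int error;
 *
 *	error = clnt_cots_kcreate(dev, &addr, AF_INET, NFS_PROGRAM,
 *	    NFS_VERSION, 0, kcred, &client);
 *	if (error == 0) {
 *		status = CLNT_CALL(client, proc, xdr_args, argsp,
 *		    xdr_res, resp, timeout);
 *		...
 *		CLNT_DESTROY(client);
 *	}
 */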

/*
 * Return error info on this handle.
 */
static void
clnt_cots_kerror(CLIENT *h, struct rpc_err *err)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);

	*err = p->cku_err;
}

/*ARGSUSED*/
static bool_t
clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
{
	xdr_free(xdr_res, res_ptr);

	return (TRUE);
}

static bool_t
clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg)
{
	cku_private_t *p = htop(h);

	switch (cmd) {
	case CLSET_PROGRESS:
		p->cku_progress = TRUE;
		return (TRUE);

	case CLSET_XID:
		if (arg == NULL)
			return (FALSE);

		p->cku_xid = *((uint32_t *)arg);
		return (TRUE);

	case CLGET_XID:
		if (arg == NULL)
			return (FALSE);

		*((uint32_t *)arg) = p->cku_xid;
		return (TRUE);

	case CLSET_NODELAYONERR:
		if (arg == NULL)
			return (FALSE);

		if (*((bool_t *)arg) == TRUE) {
			p->cku_nodelayonerr = TRUE;
			return (TRUE);
		}
		if (*((bool_t *)arg) == FALSE) {
			p->cku_nodelayonerr = FALSE;
			return (TRUE);
		}
		return (FALSE);

	case CLGET_NODELAYONERR:
		if (arg == NULL)
			return (FALSE);

		*((bool_t *)arg) = p->cku_nodelayonerr;
		return (TRUE);

	case CLSET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		if (*(int *)arg != 1 && *(int *)arg != 0)
			return (FALSE);

		p->cku_useresvport = *(int *)arg;

		return (TRUE);

	case CLGET_BINDRESVPORT:
		if (arg == NULL)
			return (FALSE);

		*(int *)arg = p->cku_useresvport;

		return (TRUE);

	default:
		return (FALSE);
	}
}

/*
 * Destroy rpc handle.  Frees the space used for output buffer,
 * private data, and handle structure.
 */
static void
clnt_cots_kdestroy(CLIENT *h)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;

	RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h);
	RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid);

	if (p->cku_flags & CKU_ONQUEUE) {
		RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x "
		    "from dispatch list\n", p->cku_xid);
		call_table_remove(call);
	}

	if (call->call_reply)
		freemsg(call->call_reply);
	cv_destroy(&call->call_cv);
	mutex_destroy(&call->call_lock);

	kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
	kmem_free(p, sizeof (*p));
}

static int clnt_cots_pulls;

#define	RM_HDR_SIZE	4	/* record mark header size */
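
/*
 * A usage sketch (illustrative only) of the control interface above, as a
 * kRPC consumer would reach it through the CLNT_CONTROL() macro:
 *
 *	int resv = 1;
 *	uint32_t xid;
 *
 *	(void) CLNT_CONTROL(client, CLSET_BINDRESVPORT, (char *)&resv);
 *	(void) CLNT_CONTROL(client, CLGET_XID, (char *)&xid);
 */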

/*
 * Call remote procedure.
 */
static enum clnt_stat
clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;
	XDR *xdrs;
	struct rpc_msg reply_msg;
	mblk_t *mp;
#ifdef RPCDEBUG
	clock_t time_sent;
#endif
	struct netbuf *retryaddr;
	struct cm_xprt *cm_entry = NULL;
	queue_t *wq;
	int len, waitsecs, max_waitsecs;
	int mpsize;
	int refreshes = REFRESHES;
	int interrupted;
	int tidu_size;
	enum clnt_stat status;
	struct timeval cwait;
	bool_t delay_first = FALSE;
	clock_t ticks, now;

	RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum);
	COTSRCSTAT_INCR(p->cku_stats, rccalls);

	RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec);
	RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec);
	/*
	 * Bug ID 1240234:
	 * Look out for zero length timeouts. We don't want to
	 * wait zero seconds for a connection to be established.
	 */
	if (wait.tv_sec < clnt_cots_min_conntout) {
		cwait.tv_sec = clnt_cots_min_conntout;
		cwait.tv_usec = 0;
		RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,",
		    wait.tv_sec);
		RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout);
	} else {
		cwait = wait;
	}

call_again:
	if (cm_entry) {
		connmgr_release(cm_entry);
		cm_entry = NULL;
	}

	mp = NULL;

	/*
	 * If the call is not a retry, allocate a new xid and cache it
	 * for future retries.
	 * Bug ID 1246045:
	 * Treat call as a retry for purposes of binding the source
	 * port only if we actually attempted to send anything on
	 * the previous call.
	 */
	if (p->cku_xid == 0) {
		p->cku_xid = alloc_xid();
		call->call_zoneid = rpc_zoneid();

		/*
		 * We need to ASSERT here that our xid != 0 because this
		 * determines whether or not our call record gets placed on
		 * the hash table or the linked list.  By design, we mandate
		 * that RPC calls over cots must have xid's != 0, so we can
		 * ensure proper management of the hash table.
		 */
		ASSERT(p->cku_xid != 0);

		retryaddr = NULL;
		p->cku_flags &= ~CKU_SENT;

		if (p->cku_flags & CKU_ONQUEUE) {
			RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old"
			    " one (%p)\n", (void *)call);
			call_table_remove(call);
			p->cku_flags &= ~CKU_ONQUEUE;
			RPCLOG(64, "clnt_cots_kcallit: removing call from "
			    "dispatch list because xid was zero (now 0x%x)\n",
			    p->cku_xid);
		}

		if (call->call_reply != NULL) {
			freemsg(call->call_reply);
			call->call_reply = NULL;
		}
	} else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) {
		retryaddr = NULL;

	} else if (p->cku_flags & CKU_SENT) {
		retryaddr = &p->cku_srcaddr;

	} else {
		/*
		 * Bug ID 1246045: Nothing was sent, so set retryaddr to
		 * NULL and let connmgr_get() bind to any source port it
		 * can get.
		 */
		retryaddr = NULL;
	}

	RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid);
	RPCLOG(64, " flags = 0x%x\n", p->cku_flags);

	p->cku_err.re_status = RPC_TIMEDOUT;
	p->cku_err.re_errno = p->cku_err.re_terrno = 0;

	cm_entry = connmgr_wrapget(retryaddr, &cwait, p);

	if (cm_entry == NULL) {
		RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n",
		    clnt_sperrno(p->cku_err.re_status));

		/*
		 * The reasons why we fail to create a connection are
		 * varied.  In most cases we don't want the caller to
		 * immediately retry.  This could have one or more
		 * bad effects, including flooding the net with
		 * connect requests to ports with no listener, or a hard
		 * kernel loop due to all the "reserved" TCP ports being
		 * in use.
		 */
		delay_first = TRUE;

		/*
		 * Even if we end up returning EINTR, we still count a
		 * "can't connect", because the connection manager
		 * might have been committed to waiting for or timing out on
		 * a connection.
		 */
		COTSRCSTAT_INCR(p->cku_stats, rccantconn);
		switch (p->cku_err.re_status) {
		case RPC_INTR:
			p->cku_err.re_errno = EINTR;

			/*
			 * No need to delay because a UNIX signal(2)
			 * interrupted us.  The caller likely won't
			 * retry the CLNT_CALL() and even if it does,
			 * we assume the caller knows what it is doing.
			 */
			delay_first = FALSE;
			break;

		case RPC_TIMEDOUT:
			p->cku_err.re_errno = ETIMEDOUT;

			/*
			 * No need to delay because we already timed out
			 * on the connection request, and we assume that the
			 * transport timeout is longer than our minimum
			 * timeout, or at least not too much smaller.
			 */
			delay_first = FALSE;
			break;

		case RPC_SYSTEMERROR:
		case RPC_TLIERROR:
			/*
			 * We want to delay here because a transient
			 * system error has a better chance of going away
			 * if we delay a bit.  If it's not transient, then
			 * we don't want to end up in a hard kernel loop
			 * due to retries.
			 */
			ASSERT(p->cku_err.re_errno != 0);
			break;


		case RPC_CANTCONNECT:
			/*
			 * RPC_CANTCONNECT is set on T_ERROR_ACK which
			 * implies some error down in the TCP layer or
			 * below.  If cku_nodelayonerr is set then we
			 * assume the caller knows not to try too hard.
			 */
			RPCLOG0(8, "clnt_cots_kcallit: connection failed,");
			RPCLOG0(8, " re_status=RPC_CANTCONNECT,");
			RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno);
			RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr);
			if (p->cku_nodelayonerr == TRUE)
				delay_first = FALSE;

			p->cku_err.re_errno = EIO;

			break;

		case RPC_XPRTFAILED:
			/*
			 * We want to delay here because we likely
			 * got a refused connection.
			 */
			if (p->cku_err.re_errno == 0)
				p->cku_err.re_errno = EIO;

			RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
			    p->cku_err.re_errno);

			break;

		default:
			/*
			 * We delay here because it is better to err
			 * on the side of caution.  If we got here then
			 * status could have been RPC_SUCCESS, but we
			 * know that we did not get a connection, so
			 * force the rpc status to RPC_CANTCONNECT.
			 */
			p->cku_err.re_status = RPC_CANTCONNECT;
			p->cku_err.re_errno = EIO;
			break;
		}
		if (delay_first == TRUE)
			ticks = clnt_cots_min_tout * drv_usectohz(1000000);
		goto cots_done;
	}

	/*
	 * If we've never sent any request on this connection (send count
	 * is zero, or the connection has been reset), cache the
	 * connection's create time and send a request (possibly a retry).
	 */
	if ((p->cku_flags & CKU_SENT) == 0 ||
	    p->cku_ctime != cm_entry->x_ctime) {
		p->cku_ctime = cm_entry->x_ctime;

	} else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) &&
	    (call->call_reply != NULL ||
	    p->cku_recv_attempts < clnt_cots_maxrecv)) {

		/*
		 * If we've sent a request and our call is on the dispatch
		 * queue and we haven't made too many receive attempts, then
		 * don't re-send, just receive.
		 */
		p->cku_recv_attempts++;
		goto read_again;
	}

	/*
	 * Now we create the RPC request in a STREAMS message.  We have to do
	 * this after the call to connmgr_get so that we have the correct
	 * TIDU size for the transport.
	 */
	tidu_size = cm_entry->x_tidu_size;
	len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE);

	while ((mp = allocb(len, BPRI_MED)) == NULL) {
		if (strwaitbuf(len, BPRI_MED)) {
			p->cku_err.re_status = RPC_SYSTEMERROR;
			p->cku_err.re_errno = ENOSR;
			COTSRCSTAT_INCR(p->cku_stats, rcnomem);
			goto cots_done;
		}
	}
	xdrs = &p->cku_outxdr;
	xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size);
	mpsize = MBLKSIZE(mp);
	ASSERT(mpsize >= len);
	ASSERT(mp->b_rptr == mp->b_datap->db_base);

	/*
	 * If the size of the mblk is not appreciably larger than what we
	 * asked for, then resize the mblk to exactly len bytes. The reason
	 * for this: suppose len is 1600 bytes, the tidu is 1460 bytes
	 * (from TCP over ethernet), and the arguments to the RPC require
	 * 2800 bytes. Ideally we want the protocol to render two
	 * ~1400 byte segments over the wire. However if allocb() gives us a 2k
	 * mblk, and we allocate a second mblk for the remainder, the protocol
	 * module may generate 3 segments over the wire:
	 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and
	 * 892 for the third. If we "waste" 448 bytes in the first mblk,
	 * the XDR encoding will generate two ~1400 byte mblks, and the
	 * protocol module is more likely to produce properly sized segments.
	 */
	if ((mpsize >> 1) <= len)
		mp->b_rptr += (mpsize - len);

	/*
	 * Adjust b_rptr to reserve space for the non-data protocol headers
	 * any downstream modules might like to add, and for the
	 * record marking header.
	 */
	mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);

	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
		/* Copy in the preserialized RPC header information. */
		bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE);

		/* Use XDR_SETPOS() to set the b_wptr to past the RPC header. */
		XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base +
		    WIRE_HDR_SIZE));

		ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE);

		/* Serialize the procedure number and the arguments. */
		if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
		    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
		    (!(*xdr_args)(xdrs, argsp))) {
			XDR_DESTROY(xdrs);
			p->cku_err.re_status = RPC_CANTENCODEARGS;
			p->cku_err.re_errno = EIO;
			goto cots_done;
		}

		(*(uint32_t *)(mp->b_rptr)) = p->cku_xid;
	} else {
		uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE];
		IXDR_PUT_U_INT32(uproc, procnum);

		(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;

		/* Use XDR_SETPOS() to set the b_wptr. */
		XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));

		/* Serialize the procedure number and the arguments. */
		if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4,
		    xdrs, xdr_args, argsp)) {
			XDR_DESTROY(xdrs);
			p->cku_err.re_status = RPC_CANTENCODEARGS;
			p->cku_err.re_errno = EIO;
			goto cots_done;
		}
	}

	XDR_DESTROY(xdrs);

	RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n",
	    tidu_size);

	wq = cm_entry->x_wq;
	waitsecs = 0;

dispatch_again:
	status = clnt_dispatch_send(wq, mp, call, p->cku_xid,
	    (p->cku_flags & CKU_ONQUEUE));

	if ((status == RPC_CANTSEND) && (call->call_reason == ENOBUFS)) {
		/*
		 * QFULL condition: allow some time for the queue to drain
		 * and try again.  Give up after waiting for the full timeout
		 * specified for the call, or if the zone is going away.
		 */
		max_waitsecs = wait.tv_sec ? wait.tv_sec : clnt_cots_min_tout;
		if ((waitsecs++ < max_waitsecs) &&
		    !(zone_status_get(curproc->p_zone) >=
		    ZONE_IS_SHUTTING_DOWN)) {

			/* wait 1 sec for queue to drain */
			if (clnt_delay(drv_usectohz(1000000),
			    h->cl_nosignal) == EINTR) {
				p->cku_err.re_errno = EINTR;
				p->cku_err.re_status = RPC_INTR;

				goto cots_done;
			}

			/* and try again */
			goto dispatch_again;
		}
		p->cku_err.re_status = status;
		p->cku_err.re_errno = call->call_reason;
		DTRACE_PROBE(krpc__e__clntcots__kcallit__cantsend);

		goto cots_done;
	}

	if (waitsecs) {
		/* adjust timeout to account for time waited to send */
		wait.tv_sec -= waitsecs;
		if (wait.tv_sec < 0) {
			/* pick up reply on next retry */
			wait.tv_sec = 0;
		}
		DTRACE_PROBE2(clnt_cots__sendwait, CLIENT *, h,
		    int, waitsecs);
	}

	RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n",
	    (uint_t)p->cku_xid);
	p->cku_flags = (CKU_ONQUEUE|CKU_SENT);
	p->cku_recv_attempts = 1;

#ifdef RPCDEBUG
	time_sent = ddi_get_lbolt();
#endif

	/*
	 * Wait for a reply or a timeout.  If there is no error or timeout,
	 * (both indicated by call_status), call->call_reply will contain
	 * the RPC reply message.
	 */
read_again:
	mutex_enter(&call->call_lock);
	interrupted = 0;
	if (call->call_status == RPC_TIMEDOUT) {
		/*
		 * Indicate that the lwp is not to be stopped while waiting
		 * for this network traffic.  This is to avoid deadlock while
		 * debugging a process via /proc and also to avoid recursive
		 * mutex_enter()s due to NFS page faults while stopping
		 * (NFS holds locks when it calls here).
		 */
		clock_t cv_wait_ret;
		clock_t timout;
		clock_t oldlbolt;

		klwp_t *lwp = ttolwp(curthread);

		if (lwp != NULL)
			lwp->lwp_nostop++;

		oldlbolt = ddi_get_lbolt();
		timout = wait.tv_sec * drv_usectohz(1000000) +
		    drv_usectohz(wait.tv_usec) + oldlbolt;
		/*
		 * Iterate until the call_status is changed to something
		 * other than RPC_TIMEDOUT, or until cv_timedwait_sig()
		 * returns something <= 0.  The latter means that we
		 * timed out.
		 */
		if (h->cl_nosignal)
			while ((cv_wait_ret = cv_timedwait(&call->call_cv,
			    &call->call_lock, timout)) > 0 &&
			    call->call_status == RPC_TIMEDOUT)
				;
		else
			while ((cv_wait_ret = cv_timedwait_sig(
			    &call->call_cv,
			    &call->call_lock, timout)) > 0 &&
			    call->call_status == RPC_TIMEDOUT)
				;

		switch (cv_wait_ret) {
		case 0:
			/*
			 * If we got out of the above loop with
			 * cv_timedwait_sig() returning 0, then we were
			 * interrupted regardless of what call_status is.
			 */
			interrupted = 1;
			break;
		case -1:
			/* cv_timedwait_sig() timed out */
			break;
		default:

			/*
			 * We were cv_signaled().  If we didn't
			 * get a successful call_status and returned
			 * before time expired, delay up to clnt_cots_min_tout
			 * seconds so that the caller doesn't immediately
			 * try to call us again and thus force the
			 * same condition that got us here (such
			 * as an RPC_XPRTFAILED due to the server not
			 * listening on the end-point).
			 */
			if (call->call_status != RPC_SUCCESS) {
				clock_t curlbolt;
				clock_t diff;

				curlbolt = ddi_get_lbolt();
				ticks = clnt_cots_min_tout *
				    drv_usectohz(1000000);
				diff = curlbolt - oldlbolt;
				if (diff < ticks) {
					delay_first = TRUE;
					if (diff > 0)
						ticks -= diff;
				}
			}
			break;
		}

		if (lwp != NULL)
			lwp->lwp_nostop--;
	}
	/*
	 * Get the reply message, if any.  This will be freed at the end
	 * whether or not an error occurred.
	 */
	mp = call->call_reply;
	call->call_reply = NULL;

	/*
	 * call_err is the error info when the call is on the dispatch queue.
	 * cku_err is the error info returned to the caller.
	 * Sync cku_err with call_err for local message processing.
	 */

	status = call->call_status;
	p->cku_err = call->call_err;
	mutex_exit(&call->call_lock);

	if (status != RPC_SUCCESS) {
		switch (status) {
		case RPC_TIMEDOUT:
			now = ddi_get_lbolt();
			if (interrupted) {
				COTSRCSTAT_INCR(p->cku_stats, rcintrs);
				p->cku_err.re_status = RPC_INTR;
				p->cku_err.re_errno = EINTR;
				RPCLOG(1, "clnt_cots_kcallit: xid 0x%x",
				    p->cku_xid);
				RPCLOG(1, "signal interrupted at %ld", now);
				RPCLOG(1, ", was sent at %ld\n", time_sent);
			} else {
				COTSRCSTAT_INCR(p->cku_stats, rctimeouts);
				p->cku_err.re_errno = ETIMEDOUT;
				RPCLOG(1, "clnt_cots_kcallit: timed out at %ld",
				    now);
				RPCLOG(1, ", was sent at %ld\n", time_sent);
			}
			break;

		case RPC_XPRTFAILED:
			if (p->cku_err.re_errno == 0)
				p->cku_err.re_errno = EIO;

			RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
			    p->cku_err.re_errno);
			break;

		case RPC_SYSTEMERROR:
			ASSERT(p->cku_err.re_errno);
			RPCLOG(1, "clnt_cots_kcallit: system error: %d\n",
			    p->cku_err.re_errno);
			break;

		default:
			p->cku_err.re_status = RPC_SYSTEMERROR;
			p->cku_err.re_errno = EIO;
			RPCLOG(1, "clnt_cots_kcallit: error: %s\n",
			    clnt_sperrno(status));
			break;
		}
		if (p->cku_err.re_status != RPC_TIMEDOUT) {

			if (p->cku_flags & CKU_ONQUEUE) {
				call_table_remove(call);
				p->cku_flags &= ~CKU_ONQUEUE;
			}

			RPCLOG(64, "clnt_cots_kcallit: non TIMEOUT so xid 0x%x "
			    "taken off dispatch list\n", p->cku_xid);
			if (call->call_reply) {
				freemsg(call->call_reply);
				call->call_reply = NULL;
			}
		} else if (wait.tv_sec != 0) {
			/*
			 * We've sent the request over TCP and so we have
			 * every reason to believe it will get
			 * delivered.  In which case returning a timeout is not
			 * appropriate.
			 */
			if (p->cku_progress == TRUE &&
			    p->cku_recv_attempts < clnt_cots_maxrecv) {
				p->cku_err.re_status = RPC_INPROGRESS;
			}
		}
		goto cots_done;
	}

	xdrs = &p->cku_inxdr;
	xdrmblk_init(xdrs, mp, XDR_DECODE, 0);

	reply_msg.rm_direction = REPLY;
	reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
	reply_msg.acpted_rply.ar_stat = SUCCESS;

	reply_msg.acpted_rply.ar_verf = _null_auth;
	/*
	 * xdr_results will be done in AUTH_UNWRAP.
	 */
	reply_msg.acpted_rply.ar_results.where = NULL;
	reply_msg.acpted_rply.ar_results.proc = xdr_void;

	if (xdr_replymsg(xdrs, &reply_msg)) {
		enum clnt_stat re_status;

		_seterr_reply(&reply_msg, &p->cku_err);

		re_status = p->cku_err.re_status;
		if (re_status == RPC_SUCCESS) {
			/*
			 * Reply is good, check auth.
			 */
			if (!AUTH_VALIDATE(h->cl_auth,
			    &reply_msg.acpted_rply.ar_verf)) {
				COTSRCSTAT_INCR(p->cku_stats, rcbadverfs);
				RPCLOG0(1, "clnt_cots_kcallit: validation "
				    "failure\n");
				freemsg(mp);
				(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
				XDR_DESTROY(xdrs);
				mutex_enter(&call->call_lock);
				if (call->call_reply == NULL)
					call->call_status = RPC_TIMEDOUT;
				mutex_exit(&call->call_lock);
				goto read_again;
			} else if (!AUTH_UNWRAP(h->cl_auth, xdrs,
			    xdr_results, resultsp)) {
				RPCLOG0(1, "clnt_cots_kcallit: validation "
				    "failure (unwrap)\n");
				p->cku_err.re_status = RPC_CANTDECODERES;
				p->cku_err.re_errno = EIO;
			}
		} else {
			/* set errno in case we can't recover */
			if (re_status != RPC_VERSMISMATCH &&
			    re_status != RPC_AUTHERROR &&
			    re_status != RPC_PROGVERSMISMATCH)
				p->cku_err.re_errno = EIO;

			if (re_status == RPC_AUTHERROR) {
				/*
				 * Maybe our credentials need to be refreshed.
				 */
				if (cm_entry) {
					/*
					 * There is the potential that the
					 * cm_entry has/will be marked dead,
					 * so drop the connection altogether,
					 * and force REFRESH to establish a
					 * new connection.
					 */
					connmgr_cancelconn(cm_entry);
					cm_entry = NULL;
				}

				(void) xdr_rpc_free_verifier(xdrs,
				    &reply_msg);
				XDR_DESTROY(xdrs);

				if (p->cku_flags & CKU_ONQUEUE) {
					call_table_remove(call);
					p->cku_flags &= ~CKU_ONQUEUE;
				}
				RPCLOG(64,
				    "clnt_cots_kcallit: AUTH_ERROR, xid"
				    " 0x%x removed off dispatch list\n",
				    p->cku_xid);
				if (call->call_reply) {
					freemsg(call->call_reply);
					call->call_reply = NULL;
				}

				if ((refreshes > 0) &&
				    AUTH_REFRESH(h->cl_auth, &reply_msg,
				    p->cku_cred)) {
					refreshes--;
					freemsg(mp);
					mp = NULL;

					COTSRCSTAT_INCR(p->cku_stats,
					    rcbadcalls);
					COTSRCSTAT_INCR(p->cku_stats,
					    rcnewcreds);
					goto call_again;
				}

				/*
				 * We have used the client handle to
				 * do an AUTH_REFRESH and the RPC status may
				 * be set to RPC_SUCCESS; let's make sure to
				 * set it to RPC_AUTHERROR.
				 */
				p->cku_err.re_status = RPC_AUTHERROR;

				/*
				 * Map recoverable and unrecoverable
				 * authentication errors to appropriate errno
				 */
				switch (p->cku_err.re_why) {
				case AUTH_TOOWEAK:
					/*
					 * This could be a failure where the
					 * server requires use of a reserved
					 * port; check and optionally set the
					 * client handle's useresvport, trying
					 * one more time.  Next go round we
					 * fall out with the tooweak error.
					 */
					if (p->cku_useresvport != 1) {
						p->cku_useresvport = 1;
						p->cku_xid = 0;
						freemsg(mp);
						mp = NULL;
						goto call_again;
					}
					/* FALLTHRU */
				case AUTH_BADCRED:
				case AUTH_BADVERF:
				case AUTH_INVALIDRESP:
				case AUTH_FAILED:
				case RPCSEC_GSS_NOCRED:
				case RPCSEC_GSS_FAILED:
					p->cku_err.re_errno = EACCES;
					break;
				case AUTH_REJECTEDCRED:
				case AUTH_REJECTEDVERF:
				default:
					p->cku_err.re_errno = EIO;
					break;
				}
				RPCLOG(1, "clnt_cots_kcallit : authentication"
				    " failed with RPC_AUTHERROR of type %d\n",
				    (int)p->cku_err.re_why);
				goto cots_done;
			}
		}
	} else {
		/* reply didn't decode properly. */
		p->cku_err.re_status = RPC_CANTDECODERES;
		p->cku_err.re_errno = EIO;
		RPCLOG0(1, "clnt_cots_kcallit: decode failure\n");
	}

	(void) xdr_rpc_free_verifier(xdrs, &reply_msg);
	XDR_DESTROY(xdrs);

	if (p->cku_flags & CKU_ONQUEUE) {
		call_table_remove(call);
		p->cku_flags &= ~CKU_ONQUEUE;
	}

	RPCLOG(64, "clnt_cots_kcallit: xid 0x%x taken off dispatch list",
	    p->cku_xid);
	RPCLOG(64, " status is %s\n", clnt_sperrno(p->cku_err.re_status));
cots_done:
	if (cm_entry)
		connmgr_release(cm_entry);

	if (mp != NULL)
		freemsg(mp);
	if ((p->cku_flags & CKU_ONQUEUE) == 0 && call->call_reply) {
		freemsg(call->call_reply);
		call->call_reply = NULL;
	}
	if (p->cku_err.re_status != RPC_SUCCESS) {
		RPCLOG0(1, "clnt_cots_kcallit: tail-end failure\n");
		COTSRCSTAT_INCR(p->cku_stats, rcbadcalls);
	}

	/*
	 * No point in delaying if the zone is going away.
	 */
	if (delay_first == TRUE &&
	    !(zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)) {
		if (clnt_delay(ticks, h->cl_nosignal) == EINTR) {
			p->cku_err.re_errno = EINTR;
			p->cku_err.re_status = RPC_INTR;
		}
	}
	return (p->cku_err.re_status);
}

/*
 * Kinit routine for cots.  This sets up the correct operations in
 * the client handle, as the handle may have previously been a clts
 * handle, and clears the xid field so there is no way a new call
 * could be mistaken for a retry.  It also sets in the handle the
 * information that is passed at create/kinit time but needed at
 * call time, as cots creates the transport at call time - device,
 * address of the server, protocol family.
 */
void
clnt_cots_kinit(CLIENT *h, dev_t dev, int family, struct netbuf *addr,
    int max_msgsize, cred_t *cred)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);
	calllist_t *call = &p->cku_call;

	h->cl_ops = &tcp_ops;
	if (p->cku_flags & CKU_ONQUEUE) {
		call_table_remove(call);
		p->cku_flags &= ~CKU_ONQUEUE;
		RPCLOG(64, "clnt_cots_kinit: removing call for xid 0x%x from"
		    " dispatch list\n", p->cku_xid);
	}

	if (call->call_reply != NULL) {
		freemsg(call->call_reply);
		call->call_reply = NULL;
	}

	call->call_bucket = NULL;
	call->call_hash = 0;

	/*
	 * We don't clear cku_flags here, because clnt_cots_kcallit()
	 * takes care of handling the cku_flags reset.
	 */
	p->cku_xid = 0;
	p->cku_device = dev;
	p->cku_addrfmly = family;
	p->cku_cred = cred;

	if (p->cku_addr.maxlen < addr->len) {
		if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
			kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
		p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
		p->cku_addr.maxlen = addr->maxlen;
	}

	p->cku_addr.len = addr->len;
	bcopy(addr->buf, p->cku_addr.buf, addr->len);

	/*
	 * If the current sanity check size in rpcmod is smaller
	 * than the size needed, then increase the sanity check.
	 */
	if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
	    max_msgsize > *clnt_max_msg_sizep) {
		mutex_enter(&clnt_max_msg_lock);
		if (max_msgsize > *clnt_max_msg_sizep)
			*clnt_max_msg_sizep = max_msgsize;
		mutex_exit(&clnt_max_msg_lock);
	}
}
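
/*
 * An illustrative re-use sketch (hypothetical caller): an existing COTS
 * handle may be re-targeted with clnt_cots_kinit() instead of being
 * destroyed and recreated:
 *
 *	clnt_cots_kinit(client, newdev, AF_INET6, &new_addr, 0, kcred);
 *	status = CLNT_CALL(client, proc, xdr_args, argsp,
 *	    xdr_res, resp, timeout);
 */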

/*
 * ksettimers is a no-op for cots, with the exception of setting the xid.
 */
/* ARGSUSED */
static int
clnt_cots_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
    int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, uint32_t xid)
{
	/* LINTED pointer alignment */
	cku_private_t *p = htop(h);

	if (xid)
		p->cku_xid = xid;
	COTSRCSTAT_INCR(p->cku_stats, rctimers);
	return (0);
}

extern void rpc_poptimod(struct vnode *);
extern int kstr_push(struct vnode *, char *);

int
conn_kstat_update(kstat_t *ksp, int rw)
{
	struct cm_xprt *cm_entry;
	struct cm_kstat_xprt *cm_ksp_data;
	uchar_t *b;
	char *fbuf;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	if (ksp == NULL || ksp->ks_private == NULL)
		return (EIO);
	cm_entry = (struct cm_xprt *)ksp->ks_private;
	cm_ksp_data = (struct cm_kstat_xprt *)ksp->ks_data;

	cm_ksp_data->x_wq.value.ui32 = (uint32_t)(uintptr_t)cm_entry->x_wq;
	cm_ksp_data->x_family.value.ui32 = cm_entry->x_family;
	cm_ksp_data->x_rdev.value.ui32 = (uint32_t)cm_entry->x_rdev;
	cm_ksp_data->x_time.value.ui32 = cm_entry->x_time;
	cm_ksp_data->x_ref.value.ui32 = cm_entry->x_ref;
	cm_ksp_data->x_state.value.ui32 = cm_entry->x_state_flags;

	if (cm_entry->x_server.buf) {
		fbuf = cm_ksp_data->x_server.value.str.addr.ptr;
		if (cm_entry->x_family == AF_INET &&
		    cm_entry->x_server.len ==
		    sizeof (struct sockaddr_in)) {
			struct sockaddr_in *sa;
			sa = (struct sockaddr_in *)
			    cm_entry->x_server.buf;
			b = (uchar_t *)&sa->sin_addr;
			(void) sprintf(fbuf,
			    "%d.%d.%d.%d", b[0] & 0xFF, b[1] & 0xFF,
			    b[2] & 0xFF, b[3] & 0xFF);
			cm_ksp_data->x_port.value.ui32 = ntohs(sa->sin_port);
		} else if (cm_entry->x_family == AF_INET6 &&
		    cm_entry->x_server.len >=
		    sizeof (struct sockaddr_in6)) {
			/* extract server IP address & port */
			struct sockaddr_in6 *sin6;
			sin6 = (struct sockaddr_in6 *)cm_entry->x_server.buf;
			(void) kinet_ntop6((uchar_t *)&sin6->sin6_addr, fbuf,
			    INET6_ADDRSTRLEN);
			cm_ksp_data->x_port.value.ui32 = ntohs(sin6->sin6_port);
		} else {
			struct sockaddr_in *sa;

			sa = (struct sockaddr_in *)cm_entry->x_server.buf;
			b = (uchar_t *)&sa->sin_addr;
			(void) sprintf(fbuf,
			    "%d.%d.%d.%d", b[0] & 0xFF, b[1] & 0xFF,
			    b[2] & 0xFF, b[3] & 0xFF);
		}
		KSTAT_NAMED_STR_BUFLEN(&cm_ksp_data->x_server) =
		    strlen(fbuf) + 1;
	}

	return (0);
}


/*
 * We want a version of delay which is interruptible by a UNIX signal.
 * Return EINTR if an interrupt occurred.
 */
static int
clnt_delay(clock_t ticks, bool_t nosignal)
{
	if (nosignal == TRUE) {
		delay(ticks);
		return (0);
	}
	return (delay_sig(ticks));
}

/*
 * Wait for a connection until a timeout, or until we are
 * signalled that there has been a connection state change.
 */
static enum clnt_stat
connmgr_cwait(struct cm_xprt *cm_entry, const struct timeval *waitp,
    bool_t nosignal)
{
	bool_t interrupted;
	clock_t timout, cv_stat;
	enum clnt_stat clstat;
	unsigned int old_state;

	ASSERT(MUTEX_HELD(&connmgr_lock));
	/*
	 * We wait for the transport connection to be made, or an
	 * indication that it could not be made.
	 */
	clstat = RPC_TIMEDOUT;
	interrupted = FALSE;

	old_state = cm_entry->x_state_flags;
	/*
	 * Now loop until cv_timedwait{_sig} returns because of
	 * a signal(0) or timeout(-1) or cv_signal(>0).  But it may be
	 * cv_signalled for various other reasons too.  So loop
	 * until there is a state change on the connection.
	 */

	timout = waitp->tv_sec * drv_usectohz(1000000) +
	    drv_usectohz(waitp->tv_usec) + ddi_get_lbolt();

	if (nosignal) {
		while ((cv_stat = cv_timedwait(&cm_entry->x_conn_cv,
		    &connmgr_lock, timout)) > 0 &&
		    cm_entry->x_state_flags == old_state)
			;
	} else {
		while ((cv_stat = cv_timedwait_sig(&cm_entry->x_conn_cv,
		    &connmgr_lock, timout)) > 0 &&
		    cm_entry->x_state_flags == old_state)
			;

		if (cv_stat == 0) /* got intr signal? */
			interrupted = TRUE;
	}

	if ((cm_entry->x_state_flags & (X_BADSTATES|X_CONNECTED)) ==
	    X_CONNECTED) {
		clstat = RPC_SUCCESS;
	} else {
		if (interrupted == TRUE)
			clstat = RPC_INTR;
		RPCLOG(1, "connmgr_cwait: can't connect, error: %s\n",
		    clnt_sperrno(clstat));
	}

	return (clstat);
}

/*
 * Primary interface for how RPC grabs a connection.
 */
static struct cm_xprt *
connmgr_wrapget(
	struct netbuf *retryaddr,
	const struct timeval *waitp,
	cku_private_t *p)
{
	struct cm_xprt *cm_entry;

	cm_entry = connmgr_get(retryaddr, waitp, &p->cku_addr, p->cku_addrfmly,
	    &p->cku_srcaddr, &p->cku_err, p->cku_device,
	    p->cku_client.cl_nosignal, p->cku_useresvport, p->cku_cred);

	if (cm_entry == NULL) {
		/*
		 * Re-map the call status to RPC_INTR if the err code is
		 * EINTR.  This can happen if the call status is RPC_TLIERROR.
		 * However, don't re-map if signalling has been turned off.
		 * XXX Really need to create a separate thread whenever
		 * there isn't an existing connection.
		 */
		if (p->cku_err.re_errno == EINTR) {
			if (p->cku_client.cl_nosignal == TRUE)
				p->cku_err.re_errno = EIO;
			else
				p->cku_err.re_status = RPC_INTR;
		}
	}

	return (cm_entry);
}

/*
 * Obtains a transport to the server specified in addr.  If a suitable
 * transport does not already exist in the list of cached transports, a new
 * connection is created, connected, and added to the list.  The connection
 * is for sending only - the reply message may come back on another transport
 * connection.
 *
 * To implement round-robin load balancing with multiple client connections,
 * the last entry on the list is always selected.  Once the entry is selected
 * it's re-inserted at the head of the list.
 */
static struct cm_xprt *
connmgr_get(
	struct netbuf	*retryaddr,
	const struct timeval	*waitp,	/* changed to a ptr to conserve stack */
	struct netbuf	*destaddr,
	int		addrfmly,
	struct netbuf	*srcaddr,
	struct rpc_err	*rpcerr,
	dev_t		device,
	bool_t		nosignal,
	int		useresvport,
	cred_t		*cr)
{
	struct cm_xprt *cm_entry;
	struct cm_xprt *lru_entry;
	struct cm_xprt **cmp, **prev;
	queue_t *wq;
	TIUSER *tiptr;
	int i;
	int retval;
	int tidu_size;
	bool_t connected;
	zoneid_t zoneid = rpc_zoneid();

	/*
	 * If the call is not a retry, look for a transport entry that
	 * goes to the server of interest.
	 */
	 */
	mutex_enter(&connmgr_lock);

	if (retryaddr == NULL) {
	use_new_conn:
		i = 0;
		cm_entry = lru_entry = NULL;

		prev = cmp = &cm_hd;
		while ((cm_entry = *cmp) != NULL) {
			ASSERT(cm_entry != cm_entry->x_next);
			/*
			 * Garbage collect connections that are marked
			 * as needing a disconnect.
			 */
			if (cm_entry->x_needdis) {
				CONN_HOLD(cm_entry);
				connmgr_dis_and_wait(cm_entry);
				connmgr_release(cm_entry);
				/*
				 * connmgr_lock could have been
				 * dropped for the disconnect
				 * processing so start over.
				 */
				goto use_new_conn;
			}

			/*
			 * Garbage collect the dead connections that have
			 * no threads working on them.
			 */
			if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) ==
			    X_DEAD) {
				mutex_enter(&cm_entry->x_lock);
				if (cm_entry->x_ref != 0) {
					/*
					 * Currently in use.
					 * Cleanup later.
					 */
					cmp = &cm_entry->x_next;
					mutex_exit(&cm_entry->x_lock);
					continue;
				}
				mutex_exit(&cm_entry->x_lock);
				*cmp = cm_entry->x_next;
				mutex_exit(&connmgr_lock);
				connmgr_close(cm_entry);
				mutex_enter(&connmgr_lock);
				goto use_new_conn;
			}

			if ((cm_entry->x_state_flags & X_BADSTATES) == 0 &&
			    cm_entry->x_zoneid == zoneid &&
			    cm_entry->x_rdev == device &&
			    destaddr->len == cm_entry->x_server.len &&
			    bcmp(destaddr->buf, cm_entry->x_server.buf,
			    destaddr->len) == 0) {
				/*
				 * If the matching entry isn't connected,
				 * attempt to reconnect it.
				 */
				if (cm_entry->x_connected == FALSE) {
					/*
					 * We don't go through trying
					 * to find the least recently
					 * used connection because
					 * connmgr_reconnect() briefly
					 * dropped the connmgr_lock,
					 * allowing a window for our
					 * accounting to be messed up.
					 * In any case, a re-connected
					 * connection is as good as
					 * a LRU connection.
					 */
					return (connmgr_wrapconnect(cm_entry,
					    waitp, destaddr, addrfmly, srcaddr,
					    rpcerr, TRUE, nosignal, cr));
				}
				i++;

				/* keep track of the last entry */
				lru_entry = cm_entry;
				prev = cmp;
			}
			cmp = &cm_entry->x_next;
		}

		if (i > clnt_max_conns) {
			RPCLOG(8, "connmgr_get: too many conns, dooming entry"
			    " %p\n", (void *)lru_entry->x_tiptr);
			lru_entry->x_doomed = TRUE;
			goto use_new_conn;
		}

		/*
		 * If we are at the maximum number of connections to
		 * the server, hand back the least recently used one.
		 */
		if (i == clnt_max_conns) {
			/*
			 * Copy into the handle the source address of
			 * the connection, which we will use in case of
			 * a later retry.
			 */
			if (srcaddr->len != lru_entry->x_src.len) {
				if (srcaddr->len > 0)
					kmem_free(srcaddr->buf,
					    srcaddr->maxlen);
				srcaddr->buf = kmem_zalloc(
				    lru_entry->x_src.len, KM_SLEEP);
				srcaddr->maxlen = srcaddr->len =
				    lru_entry->x_src.len;
			}
			bcopy(lru_entry->x_src.buf, srcaddr->buf, srcaddr->len);
			RPCLOG(2, "connmgr_get: call going out on %p\n",
			    (void *)lru_entry);
			lru_entry->x_time = ddi_get_lbolt();
			CONN_HOLD(lru_entry);

			if ((i > 1) && (prev != &cm_hd)) {
				/*
				 * remove and re-insert entry at head of list.
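				 * (Skipped when prev == &cm_hd, since the
				 * entry is then already at the head, or
				 * when only one matching entry exists.)
				 * The predecessor pointer saved in "prev"
				 * makes the unlink O(1); a minimal sketch
				 * of the move-to-front step performed
				 * below (illustrative only):
				 *
				 *	*prev = lru_entry->x_next;
				 *	lru_entry->x_next = cm_hd;
				 *	cm_hd = lru_entry;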
				 */
				*prev = lru_entry->x_next;
				lru_entry->x_next = cm_hd;
				cm_hd = lru_entry;
			}

			mutex_exit(&connmgr_lock);
			return (lru_entry);
		}

	} else {
		/*
		 * This is the retry case (retryaddr != NULL). Retries must
		 * be sent on the same source port as the original call.
		 */

		/*
		 * Walk the list looking for a connection with a source address
		 * that matches the retry address.
		 */
	start_retry_loop:
		cmp = &cm_hd;
		while ((cm_entry = *cmp) != NULL) {
			ASSERT(cm_entry != cm_entry->x_next);

			/*
			 * determine if this connection matches the passed
			 * in retry address. If it does not match, advance
			 * to the next element on the list.
			 */
			if (zoneid != cm_entry->x_zoneid ||
			    device != cm_entry->x_rdev ||
			    retryaddr->len != cm_entry->x_src.len ||
			    bcmp(retryaddr->buf, cm_entry->x_src.buf,
			    retryaddr->len) != 0) {
				cmp = &cm_entry->x_next;
				continue;
			}
			/*
			 * Garbage collect connections that are marked
			 * as needing a disconnect.
			 */
			if (cm_entry->x_needdis) {
				CONN_HOLD(cm_entry);
				connmgr_dis_and_wait(cm_entry);
				connmgr_release(cm_entry);
				/*
				 * connmgr_lock could have been
				 * dropped for the disconnect
				 * processing so start over.
				 */
				goto start_retry_loop;
			}
			/*
			 * Garbage collect the dead connections that have
			 * no threads working on them.
			 */
			if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) ==
			    X_DEAD) {
				mutex_enter(&cm_entry->x_lock);
				if (cm_entry->x_ref != 0) {
					/*
					 * Currently in use.
					 * Cleanup later.
					 */
					cmp = &cm_entry->x_next;
					mutex_exit(&cm_entry->x_lock);
					continue;
				}
				mutex_exit(&cm_entry->x_lock);
				*cmp = cm_entry->x_next;
				mutex_exit(&connmgr_lock);
				connmgr_close(cm_entry);
				mutex_enter(&connmgr_lock);
				goto start_retry_loop;
			}

			/*
			 * Sanity check: if the connection with our source
			 * port is going to some other server, something went
			 * wrong, as we never delete connections (i.e. release
			 * ports) unless they have been idle. In this case,
			 * it is probably better to send the call out using
			 * a new source address than to fail it altogether,
			 * since that port may never be released.
			 */
			if (destaddr->len != cm_entry->x_server.len ||
			    bcmp(destaddr->buf, cm_entry->x_server.buf,
			    destaddr->len) != 0) {
				RPCLOG(1, "connmgr_get: tiptr %p"
				    " is going to a different server"
				    " with the port that belongs"
				    " to us!\n", (void *)cm_entry->x_tiptr);
				retryaddr = NULL;
				goto use_new_conn;
			}

			/*
			 * If the connection of interest is not connected and we
			 * can't reconnect it, then the server is probably
			 * still down. Return NULL to the caller and let it
			 * retry later if it wants to. We have a delay so the
			 * machine doesn't go into a tight retry loop. If the
			 * entry was already connected, or the reconnect was
			 * successful, return this entry.
			 */
			if (cm_entry->x_connected == FALSE) {
				return (connmgr_wrapconnect(cm_entry,
				    waitp, destaddr, addrfmly, NULL,
				    rpcerr, TRUE, nosignal, cr));
			} else {
				CONN_HOLD(cm_entry);

				cm_entry->x_time = ddi_get_lbolt();
				mutex_exit(&connmgr_lock);
				RPCLOG(2, "connmgr_get: found old "
				    "transport %p for retry\n",
				    (void *)cm_entry);
				return (cm_entry);
			}
		}

		/*
		 * We cannot find an entry in the list for this retry.
		 * Either the entry has been removed temporarily to be
		 * reconnected by another thread, or the original call
		 * got a port but never got connected,
		 * and hence the transport never got put in the
		 * list. Fall through to the "create new connection" code -
		 * the former case will fail there trying to rebind the port,
		 * and the latter case (and any other pathological cases) will
		 * rebind and reconnect and not hang the client machine.
		 */
		RPCLOG0(8, "connmgr_get: no entry in list for retry\n");
	}
	/*
	 * Set up a transport entry in the connection manager's list.
	 */
	cm_entry = (struct cm_xprt *)
	    kmem_zalloc(sizeof (struct cm_xprt), KM_SLEEP);

	cm_entry->x_server.buf = kmem_zalloc(destaddr->len, KM_SLEEP);
	bcopy(destaddr->buf, cm_entry->x_server.buf, destaddr->len);
	cm_entry->x_server.len = cm_entry->x_server.maxlen = destaddr->len;

	cm_entry->x_state_flags = X_THREAD;
	cm_entry->x_ref = 1;
	cm_entry->x_family = addrfmly;
	cm_entry->x_rdev = device;
	cm_entry->x_zoneid = zoneid;
	mutex_init(&cm_entry->x_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&cm_entry->x_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&cm_entry->x_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&cm_entry->x_dis_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Note that we add this partially initialized entry to the
	 * connection list. This is so that we don't have multiple
	 * connections to the same server.
	 *
	 * Note that x_src is not initialized at this point. This is because
	 * retryaddr might be NULL in which case x_src is whatever
	 * t_kbind/bindresvport gives us. If another thread wants a
	 * connection to the same server, seemingly we have an issue, but we
	 * don't. If the other thread comes in with retryaddr == NULL, then it
	 * will never look at x_src, and it will end up waiting in
	 * connmgr_cwait() for the first thread to finish the connection
	 * attempt. If the other thread comes in with retryaddr != NULL, then
	 * that means there was a request sent on a connection, in which case
	 * the connection should already exist. Thus the first thread
	 * never gets here ... it finds the connection to its server in the
	 * connection list.
	 *
	 * But even if this theory is wrong, in the retryaddr != NULL case,
	 * the 2nd thread will skip us because x_src.len == 0.
	 */
	cm_entry->x_next = cm_hd;
	cm_hd = cm_entry;
	mutex_exit(&connmgr_lock);

	/*
	 * Either we didn't find an entry to the server of interest, or we
	 * don't have the maximum number of connections to that server -
	 * create a new connection.
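	 *
	 * Setting one up means building the TLI/STREAMS plumbing by hand;
	 * the code below performs, in order (a summary of the steps that
	 * follow, not additional logic):
	 *
	 *	t_kopen()			open the transport device
	 *	rpc_poptimod()			pop timod off the stream
	 *	I_PUSH "rpcmod" + RPC_CLIENT	interpose kRPC's module in
	 *					client mode
	 *	I_PUSH "timod"			put timod back on top
	 *	bindresvport()/t_kbind()	bind the local address
	 *	connmgr_connect()		send T_CONN_REQ and wait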
	 */
	RPCLOG0(8, "connmgr_get: creating new connection\n");
	rpcerr->re_status = RPC_TLIERROR;

	i = t_kopen(NULL, device, FREAD|FWRITE|FNDELAY, &tiptr, zone_kcred());
	if (i) {
		RPCLOG(1, "connmgr_get: can't open cots device, error %d\n", i);
		rpcerr->re_errno = i;
		connmgr_cancelconn(cm_entry);
		return (NULL);
	}
	rpc_poptimod(tiptr->fp->f_vnode);

	if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0,
	    K_TO_K, kcred, &retval)) {
		RPCLOG(1, "connmgr_get: can't push cots module, %d\n", i);
		(void) t_kclose(tiptr, 1);
		rpcerr->re_errno = i;
		connmgr_cancelconn(cm_entry);
		return (NULL);
	}

	if (i = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K,
	    kcred, &retval)) {
		RPCLOG(1, "connmgr_get: can't set client status with cots "
		    "module, %d\n", i);
		(void) t_kclose(tiptr, 1);
		rpcerr->re_errno = i;
		connmgr_cancelconn(cm_entry);
		return (NULL);
	}

	mutex_enter(&connmgr_lock);

	wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next;
	cm_entry->x_wq = wq;

	mutex_exit(&connmgr_lock);

	if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0,
	    K_TO_K, kcred, &retval)) {
		RPCLOG(1, "connmgr_get: can't push timod, %d\n", i);
		(void) t_kclose(tiptr, 1);
		rpcerr->re_errno = i;
		connmgr_cancelconn(cm_entry);
		return (NULL);
	}

	/*
	 * If the caller has not specified reserved port usage then
	 * take the system default.
	 */
	if (useresvport == -1)
		useresvport = clnt_cots_do_bindresvport;

	if ((useresvport || retryaddr != NULL) &&
	    (addrfmly == AF_INET || addrfmly == AF_INET6)) {
		bool_t alloc_src = FALSE;

		if (srcaddr->len != destaddr->len) {
			kmem_free(srcaddr->buf, srcaddr->maxlen);
			srcaddr->buf = kmem_zalloc(destaddr->len, KM_SLEEP);
			srcaddr->maxlen = destaddr->len;
			srcaddr->len = destaddr->len;
			alloc_src = TRUE;
		}

		if ((i = bindresvport(tiptr, retryaddr, srcaddr, TRUE)) != 0) {
			(void) t_kclose(tiptr, 1);
			RPCLOG(1, "connmgr_get: couldn't bind, retryaddr: "
			    "%p\n", (void *)retryaddr);

			/*
			 * 1225408: If we allocated a source address, then it
			 * is either garbage or all zeroes. In that case
			 * we need to clear srcaddr.
			 */
			if (alloc_src == TRUE) {
				kmem_free(srcaddr->buf, srcaddr->maxlen);
				srcaddr->maxlen = srcaddr->len = 0;
				srcaddr->buf = NULL;
			}
			rpcerr->re_errno = i;
			connmgr_cancelconn(cm_entry);
			return (NULL);
		}
	} else {
		if ((i = t_kbind(tiptr, NULL, NULL)) != 0) {
			RPCLOG(1, "clnt_cots_kcreate: t_kbind: %d\n", i);
			(void) t_kclose(tiptr, 1);
			rpcerr->re_errno = i;
			connmgr_cancelconn(cm_entry);
			return (NULL);
		}
	}

	{
		/*
		 * Keep the kernel stack lean. Don't move this call
		 * declaration to the top of this function because a
		 * call is declared in connmgr_wrapconnect()
		 */
		calllist_t call;

		bzero(&call, sizeof (call));
		cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL);

		/*
		 * This is a bound end-point so don't close its stream.
		 */
		connected = connmgr_connect(cm_entry, wq, destaddr, addrfmly,
		    &call, &tidu_size, FALSE, waitp, nosignal, cr);
		*rpcerr = call.call_err;
		cv_destroy(&call.call_cv);

	}

	mutex_enter(&connmgr_lock);

	/*
	 * Set up a transport entry in the connection manager's list.
	 */
	cm_entry->x_src.buf = kmem_zalloc(srcaddr->len, KM_SLEEP);
	bcopy(srcaddr->buf, cm_entry->x_src.buf, srcaddr->len);
	cm_entry->x_src.len = cm_entry->x_src.maxlen = srcaddr->len;

	cm_entry->x_tiptr = tiptr;
	cm_entry->x_time = ddi_get_lbolt();

	if (tiptr->tp_info.servtype == T_COTS_ORD)
		cm_entry->x_ordrel = TRUE;
	else
		cm_entry->x_ordrel = FALSE;

	cm_entry->x_tidu_size = tidu_size;

	if (cm_entry->x_early_disc) {
		/*
		 * We need to check if a disconnect request has come
		 * while we are connected, if so, then we need to
		 * set rpcerr->re_status appropriately before returning
		 * NULL to caller.
		 */
		if (rpcerr->re_status == RPC_SUCCESS)
			rpcerr->re_status = RPC_XPRTFAILED;
		cm_entry->x_connected = FALSE;
		cm_entry->x_dead = TRUE;
	} else
		cm_entry->x_connected = connected;

	/*
	 * There could be a discrepancy here such that
	 * x_early_disc is TRUE yet connected is TRUE as well
	 * and the connection is actually connected. In that case
	 * let's be conservative and declare the connection as not
	 * connected.
	 */
	cm_entry->x_early_disc = FALSE;
	cm_entry->x_needdis = (cm_entry->x_connected == FALSE);
	cm_entry->x_ctime = ddi_get_lbolt();

	/*
	 * Notify any threads waiting that the connection attempt is done.
	 */
	cm_entry->x_thread = FALSE;
	cv_broadcast(&cm_entry->x_conn_cv);

	if (cm_entry->x_connected == FALSE) {
		mutex_exit(&connmgr_lock);
		connmgr_release(cm_entry);
		return (NULL);
	}

	mutex_exit(&connmgr_lock);

	return (cm_entry);
}

/*
 * Keep the cm_xprt entry on the connection list when making a connection. This
 * is to prevent multiple connections to a slow server from appearing.
 * We use the bit field x_thread to tell if a thread is doing a connection
 * attempt, which keeps other interested threads from messing with the
 * connection. Those other threads just wait if x_thread is set.
 *
 * If x_thread is not set, then we do the actual work of connecting via
 * connmgr_connect().
 *
 * mutex convention: called with connmgr_lock held, returns with it released.
 */
static struct cm_xprt *
connmgr_wrapconnect(
	struct cm_xprt *cm_entry,
	const struct timeval *waitp,
	struct netbuf *destaddr,
	int addrfmly,
	struct netbuf *srcaddr,
	struct rpc_err *rpcerr,
	bool_t reconnect,
	bool_t nosignal,
	cred_t *cr)
{
	ASSERT(MUTEX_HELD(&connmgr_lock));
	/*
	 * Hold this entry as we are about to drop connmgr_lock.
	 */
	CONN_HOLD(cm_entry);

	/*
	 * If there is a thread already making a connection for us, then
	 * wait for it to complete the connection.
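	 *
	 * In other words, the entry acts as a condition-protected state
	 * machine: the first thread in sets x_thread and does the connect,
	 * and later arrivals sleep in connmgr_cwait() on x_conn_cv until
	 * x_state_flags changes. A minimal sketch of the waiter's side of
	 * the split below (illustrative only):
	 *
	 *	if (cm_entry->x_thread == TRUE)
	 *		status = connmgr_cwait(cm_entry, waitp, nosignal);
	 *	else
	 *		... become the connecting thread ...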
	 */
	if (cm_entry->x_thread == TRUE) {
		rpcerr->re_status = connmgr_cwait(cm_entry, waitp, nosignal);

		if (rpcerr->re_status != RPC_SUCCESS) {
			mutex_exit(&connmgr_lock);
			connmgr_release(cm_entry);
			return (NULL);
		}
	} else {
		bool_t connected;
		calllist_t call;

		cm_entry->x_thread = TRUE;

		while (cm_entry->x_needrel == TRUE) {
			cm_entry->x_needrel = FALSE;

			connmgr_sndrel(cm_entry);
			delay(drv_usectohz(1000000));

			mutex_enter(&connmgr_lock);
		}

		/*
		 * If we need to send a T_DISCON_REQ, send one.
		 */
		connmgr_dis_and_wait(cm_entry);

		mutex_exit(&connmgr_lock);

		bzero(&call, sizeof (call));
		cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL);

		connected = connmgr_connect(cm_entry, cm_entry->x_wq,
		    destaddr, addrfmly, &call, &cm_entry->x_tidu_size,
		    reconnect, waitp, nosignal, cr);

		*rpcerr = call.call_err;
		cv_destroy(&call.call_cv);

		mutex_enter(&connmgr_lock);

		if (cm_entry->x_early_disc) {
			/*
			 * We need to check if a disconnect request has come
			 * while we are connected, if so, then we need to
			 * set rpcerr->re_status appropriately before returning
			 * NULL to caller.
			 */
			if (rpcerr->re_status == RPC_SUCCESS)
				rpcerr->re_status = RPC_XPRTFAILED;
			cm_entry->x_connected = FALSE;
			cm_entry->x_dead = TRUE;
		} else
			cm_entry->x_connected = connected;

		/*
		 * There could be a discrepancy here such that
		 * x_early_disc is TRUE yet connected is TRUE as well
		 * and the connection is actually connected. In that case
		 * let's be conservative and declare the connection as not
		 * connected.
		 */
		cm_entry->x_early_disc = FALSE;
		cm_entry->x_needdis = (cm_entry->x_connected == FALSE);

		/*
		 * connmgr_connect() may have given up before the connection
		 * actually timed out. So ensure that before the next
		 * connection attempt we do a disconnect.
		 */
		cm_entry->x_ctime = ddi_get_lbolt();
		cm_entry->x_thread = FALSE;

		cv_broadcast(&cm_entry->x_conn_cv);

		if (cm_entry->x_connected == FALSE) {
			mutex_exit(&connmgr_lock);
			connmgr_release(cm_entry);
			return (NULL);
		}
	}

	if (srcaddr != NULL) {
		/*
		 * Copy into the handle the source address of the
		 * connection, which we will use in case of a later retry.
		 */
		if (srcaddr->len != cm_entry->x_src.len) {
			if (srcaddr->maxlen > 0)
				kmem_free(srcaddr->buf, srcaddr->maxlen);
			srcaddr->buf = kmem_zalloc(cm_entry->x_src.len,
			    KM_SLEEP);
			srcaddr->maxlen = srcaddr->len =
			    cm_entry->x_src.len;
		}
		bcopy(cm_entry->x_src.buf, srcaddr->buf, srcaddr->len);
	}
	cm_entry->x_time = ddi_get_lbolt();
	mutex_exit(&connmgr_lock);
	return (cm_entry);
}

/*
 * If we need to send a T_DISCON_REQ, send one.
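 *
 * The x_needdis/x_waitdis pair implements a small retry loop: x_needdis
 * means "a T_DISCON_REQ still has to go out", x_waitdis means "one is
 * outstanding and we are waiting for its T_OK_ACK/T_ERROR_ACK". In
 * outline (a summary of the loop below, not additional logic):
 *
 *	while x_needdis:  send T_DISCON_REQ, set x_waitdis
 *	if x_waitdis:     cv_reltimedwait() on x_dis_cv, bounded by
 *	                  clnt_cots_min_conntout seconds
 *	if the ack never arrived, set x_needdis and go around again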
 */
static void
connmgr_dis_and_wait(struct cm_xprt *cm_entry)
{
	ASSERT(MUTEX_HELD(&connmgr_lock));
	for (;;) {
		while (cm_entry->x_needdis == TRUE) {
			RPCLOG(8, "connmgr_dis_and_wait: need "
			    "T_DISCON_REQ for connection 0x%p\n",
			    (void *)cm_entry);
			cm_entry->x_needdis = FALSE;
			cm_entry->x_waitdis = TRUE;

			connmgr_snddis(cm_entry);

			mutex_enter(&connmgr_lock);
		}

		if (cm_entry->x_waitdis == TRUE) {
			clock_t timout;

			RPCLOG(8, "connmgr_dis_and_wait waiting for "
			    "T_DISCON_REQ's ACK for connection %p\n",
			    (void *)cm_entry);

			timout = clnt_cots_min_conntout * drv_usectohz(1000000);

			/*
			 * The TPI spec says that the T_DISCON_REQ
			 * will get acknowledged, but in practice
			 * the ACK may never get sent. So don't
			 * block forever.
			 */
			(void) cv_reltimedwait(&cm_entry->x_dis_cv,
			    &connmgr_lock, timout, TR_CLOCK_TICK);
		}
		/*
		 * If we got the ACK, break. If we didn't,
		 * then send another T_DISCON_REQ.
		 */
		if (cm_entry->x_waitdis == FALSE) {
			break;
		} else {
			RPCLOG(8, "connmgr_dis_and_wait: did "
			    "not get T_DISCON_REQ's ACK for "
			    "connection %p\n", (void *)cm_entry);
			cm_entry->x_needdis = TRUE;
		}
	}
}

static void
connmgr_cancelconn(struct cm_xprt *cm_entry)
{
	/*
	 * Mark the connection table entry as dead; the next thread that
	 * goes through connmgr_release() will notice this and deal with it.
	 */
	mutex_enter(&connmgr_lock);
	cm_entry->x_dead = TRUE;

	/*
	 * Notify any threads waiting for the connection that it isn't
	 * going to happen.
	 */
	cm_entry->x_thread = FALSE;
	cv_broadcast(&cm_entry->x_conn_cv);
	mutex_exit(&connmgr_lock);

	connmgr_release(cm_entry);
}

static void
connmgr_close(struct cm_xprt *cm_entry)
{
	mutex_enter(&cm_entry->x_lock);
	while (cm_entry->x_ref != 0) {
		/*
		 * Must be a noninterruptible wait.
		 */
		cv_wait(&cm_entry->x_cv, &cm_entry->x_lock);
	}

	if (cm_entry->x_tiptr != NULL)
		(void) t_kclose(cm_entry->x_tiptr, 1);

	mutex_exit(&cm_entry->x_lock);
	if (cm_entry->x_ksp != NULL) {
		mutex_enter(&connmgr_lock);
		cm_entry->x_ksp->ks_private = NULL;
		mutex_exit(&connmgr_lock);

		/*
		 * Must free the buffer we allocated for the
		 * server address in the update function.
		 */
		if (((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))->
		    x_server.value.str.addr.ptr != NULL)
			kmem_free(((struct cm_kstat_xprt *)(cm_entry->x_ksp->
			    ks_data))->x_server.value.str.addr.ptr,
			    INET6_ADDRSTRLEN);
		kmem_free(cm_entry->x_ksp->ks_data,
		    cm_entry->x_ksp->ks_data_size);
		kstat_delete(cm_entry->x_ksp);
	}

	mutex_destroy(&cm_entry->x_lock);
	cv_destroy(&cm_entry->x_cv);
	cv_destroy(&cm_entry->x_conn_cv);
	cv_destroy(&cm_entry->x_dis_cv);

	if (cm_entry->x_server.buf != NULL)
		kmem_free(cm_entry->x_server.buf, cm_entry->x_server.maxlen);
	if (cm_entry->x_src.buf != NULL)
		kmem_free(cm_entry->x_src.buf, cm_entry->x_src.maxlen);
	kmem_free(cm_entry, sizeof (struct cm_xprt));
}

/*
 * Called by KRPC after sending the call message to release the connection
 * it was using.
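 *
 * Every CONN_HOLD() must be balanced by exactly one connmgr_release();
 * the final release is what lets connmgr_close() of a dead entry proceed.
 * The usual pattern in this file (an illustrative sketch, not a fixed
 * API contract):
 *
 *	cm_entry = connmgr_wrapget(retryaddr, waitp, p);  - takes a hold
 *	... transmit the call on cm_entry->x_wq ...
 *	connmgr_release(cm_entry);                        - drops it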
 */
static void
connmgr_release(struct cm_xprt *cm_entry)
{
	mutex_enter(&cm_entry->x_lock);
	cm_entry->x_ref--;
	if (cm_entry->x_ref == 0)
		cv_signal(&cm_entry->x_cv);
	mutex_exit(&cm_entry->x_lock);
}

/*
 * Set TCP receive and xmit buffer size for RPC connections.
 */
static bool_t
connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr)
{
	int ok = FALSE;
	int val;

	if (rpc_default_tcp_bufsz)
		return (FALSE);

	/*
	 * Only set a new buffer size if it's larger than the system
	 * default buffer size. If a smaller buffer size is needed
	 * then use /etc/system to set rpc_default_tcp_bufsz to 1.
	 */
	ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr);
	if ((ok == TRUE) && (val < rpc_send_bufsz)) {
		ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF,
		    rpc_send_bufsz, e, cr);
		DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz,
		    int, ok, calllist_t *, e);
	}

	ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr);
	if ((ok == TRUE) && (val < rpc_recv_bufsz)) {
		ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF,
		    rpc_recv_bufsz, e, cr);
		DTRACE_PROBE2(krpc__i__connmgr_sndbufsz,
		    int, ok, calllist_t *, e);
	}
	return (TRUE);
}

/*
 * Given an open stream, connect to the remote. Returns true if connected,
 * false otherwise.
 */
static bool_t
connmgr_connect(
	struct cm_xprt *cm_entry,
	queue_t *wq,
	struct netbuf *addr,
	int addrfmly,
	calllist_t *e,
	int *tidu_ptr,
	bool_t reconnect,
	const struct timeval *waitp,
	bool_t nosignal,
	cred_t *cr)
{
	mblk_t *mp;
	struct T_conn_req *tcr;
	struct T_info_ack *tinfo;
	int interrupted, error;
	int tidu_size, kstat_instance;

	/* if it's a reconnect, flush any lingering data messages */
	if (reconnect)
		(void) putctl1(wq, M_FLUSH, FLUSHRW);

	/*
	 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
	 * appear as -1.
	 */
	mp = allocb_cred(sizeof (*tcr) + addr->len, cr, NOPID);
	if (mp == NULL) {
		/*
		 * This is unfortunate, but we need to look up the stats for
		 * this zone to increment the "memory allocation failed"
		 * counter. curproc->p_zone is safe since we're initiating a
		 * connection and not in some strange streams context.
		 */
		struct rpcstat *rpcstat;

		rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
		ASSERT(rpcstat != NULL);

		RPCLOG0(1, "connmgr_connect: cannot alloc mp for "
		    "sending conn request\n");
		COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcnomem);
		e->call_status = RPC_SYSTEMERROR;
		e->call_reason = ENOSR;
		return (FALSE);
	}

	/* Set TCP buffer size for RPC connections if needed */
	if (addrfmly == AF_INET || addrfmly == AF_INET6)
		(void) connmgr_setbufsz(e, wq, cr);

	mp->b_datap->db_type = M_PROTO;
	tcr = (struct T_conn_req *)mp->b_rptr;
	bzero(tcr, sizeof (*tcr));
	tcr->PRIM_type = T_CONN_REQ;
	tcr->DEST_length = addr->len;
	tcr->DEST_offset = sizeof (struct T_conn_req);
	mp->b_wptr = mp->b_rptr + sizeof (*tcr);

	bcopy(addr->buf, mp->b_wptr, tcr->DEST_length);
	mp->b_wptr += tcr->DEST_length;

	RPCLOG(8, "connmgr_connect: sending conn request on queue "
	    "%p", (void *)wq);
	RPCLOG(8, " call %p\n", (void *)wq);
	/*
	 * We use the entry in the handle that is normally used for
	 * waiting for RPC replies to wait for the connection accept.
	 */
	if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
		DTRACE_PROBE(krpc__e__connmgr__connect__cantsend);
		freemsg(mp);
		return (FALSE);
	}

	mutex_enter(&clnt_pending_lock);

	/*
	 * We wait for the transport connection to be made, or an
	 * indication that it could not be made.
	 */
	interrupted = 0;

	/*
	 * waitforack should have been called with T_OK_ACK, but the
	 * present implementation needs to be passed T_INFO_ACK to
	 * work correctly.
	 */
	error = waitforack(e, T_INFO_ACK, waitp, nosignal);
	if (error == EINTR)
		interrupted = 1;
	if (zone_status_get(curproc->p_zone) >= ZONE_IS_EMPTY) {
		/*
		 * No time to lose; we essentially have been signaled to
		 * quit.
		 */
		interrupted = 1;
	}
#ifdef RPCDEBUG
	if (error == ETIME)
		RPCLOG0(8, "connmgr_connect: giving up "
		    "on connection attempt; "
		    "clnt_dispatch notifyconn "
		    "diagnostic 'no one waiting for "
		    "connection' should not be "
		    "unexpected\n");
#endif
	if (e->call_prev)
		e->call_prev->call_next = e->call_next;
	else
		clnt_pending = e->call_next;
	if (e->call_next)
		e->call_next->call_prev = e->call_prev;
	mutex_exit(&clnt_pending_lock);

	if (e->call_status != RPC_SUCCESS || error != 0) {
		if (interrupted)
			e->call_status = RPC_INTR;
		else if (error == ETIME)
			e->call_status = RPC_TIMEDOUT;
		else if (error == EPROTO) {
			e->call_status = RPC_SYSTEMERROR;
			e->call_reason = EPROTO;
		}

		RPCLOG(8, "connmgr_connect: can't connect, status: "
		    "%s\n", clnt_sperrno(e->call_status));

		if (e->call_reply) {
			freemsg(e->call_reply);
			e->call_reply = NULL;
		}

		return (FALSE);
	}
	/*
	 * The result of the "connection accept" is a T_info_ack
	 * in the call_reply field.
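	 *
	 * The TIDU size advertised there is rounded down below to a
	 * multiple of BYTES_PER_XDR_UNIT (4 bytes), so an advertised
	 * TIDU of, say, 1461 is used as 1460; zero, negative, or
	 * oversized values fall back to COTS_DEFAULT_ALLOCSIZE.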
	 */
	ASSERT(e->call_reply != NULL);
	mp = e->call_reply;
	e->call_reply = NULL;
	tinfo = (struct T_info_ack *)mp->b_rptr;

	tidu_size = tinfo->TIDU_size;
	tidu_size -= (tidu_size % BYTES_PER_XDR_UNIT);
	if (tidu_size > COTS_DEFAULT_ALLOCSIZE || (tidu_size <= 0))
		tidu_size = COTS_DEFAULT_ALLOCSIZE;
	*tidu_ptr = tidu_size;

	freemsg(mp);

	/*
	 * Set up the pertinent options. NODELAY is so the transport doesn't
	 * buffer up RPC messages on either end. This may not be valid for
	 * all transports. Failure to set this option is not cause to
	 * bail out so we return success anyway. Note that lack of NODELAY
	 * or some other way to flush the message on both ends will cause
	 * lots of retries and terrible performance.
	 */
	if (addrfmly == AF_INET || addrfmly == AF_INET6) {
		(void) connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr);
		if (e->call_status == RPC_XPRTFAILED)
			return (FALSE);
	}

	/*
	 * Since we have a connection, we now need to figure out if
	 * we need to create a kstat. If x_ksp is not NULL then we
	 * are reusing a connection and so we do not need to create
	 * another kstat -- let's just return.
	 */
	if (cm_entry->x_ksp != NULL)
		return (TRUE);

	/*
	 * We need to increment rpc_kstat_instance atomically to prevent
	 * two kstats being created with the same instance.
	 */
	kstat_instance = atomic_inc_32_nv((uint32_t *)&rpc_kstat_instance);

	if ((cm_entry->x_ksp = kstat_create_zone("unix", kstat_instance,
	    "rpc_cots_connections", "rpc", KSTAT_TYPE_NAMED,
	    (uint_t)(sizeof (cm_kstat_xprt_t) / sizeof (kstat_named_t)),
	    KSTAT_FLAG_VIRTUAL, cm_entry->x_zoneid)) == NULL) {
		return (TRUE);
	}

	cm_entry->x_ksp->ks_lock = &connmgr_lock;
	cm_entry->x_ksp->ks_private = cm_entry;
	cm_entry->x_ksp->ks_data_size = ((INET6_ADDRSTRLEN * sizeof (char))
	    + sizeof (cm_kstat_template));
	cm_entry->x_ksp->ks_data = kmem_alloc(cm_entry->x_ksp->ks_data_size,
	    KM_SLEEP);
	bcopy(&cm_kstat_template, cm_entry->x_ksp->ks_data,
	    cm_entry->x_ksp->ks_data_size);
	((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))->
	    x_server.value.str.addr.ptr =
	    kmem_alloc(INET6_ADDRSTRLEN, KM_SLEEP);

	cm_entry->x_ksp->ks_update = conn_kstat_update;
	kstat_install(cm_entry->x_ksp);
	return (TRUE);
}

/*
 * Verify that the specified offset falls within the mblk and
 * that the resulting pointer is aligned.
 * Returns NULL if not.
 *
 * code from fs/sockfs/socksubr.c
 */
static void *
connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
    t_uscalar_t length, uint_t align_size)
{
	uintptr_t ptr1, ptr2;

	ASSERT(mp && mp->b_wptr >= mp->b_rptr);
	ptr1 = (uintptr_t)mp->b_rptr + offset;
	ptr2 = (uintptr_t)ptr1 + length;
	if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
		return (NULL);
	}
	if ((ptr1 & (align_size - 1)) != 0) {
		return (NULL);
	}
	return ((void *)ptr1);
}

static bool_t
connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
    calllist_t *e, cred_t *cr)
{
	mblk_t *mp;
	struct opthdr *opt, *opt_res;
	struct T_optmgmt_req *tor;
	struct T_optmgmt_ack *opt_ack;
	struct timeval waitp;
	int error;

	mp = allocb_cred(sizeof (struct T_optmgmt_req) +
	    sizeof (struct opthdr) + sizeof (int), cr, NOPID);
	if (mp == NULL)
		return (FALSE);

	mp->b_datap->db_type = M_PROTO;
	tor = (struct T_optmgmt_req *)(mp->b_rptr);
	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
	tor->MGMT_flags = T_CURRENT;
	tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
	tor->OPT_offset = sizeof (struct T_optmgmt_req);

	opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
	opt->level = level;
	opt->name = name;
	opt->len = sizeof (int);
	mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
	    sizeof (int);

	/*
	 * We will use this connection regardless
	 * of whether or not the option is readable.
	 */
	if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
		DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend);
		freemsg(mp);
		return (FALSE);
	}

	mutex_enter(&clnt_pending_lock);

	waitp.tv_sec = clnt_cots_min_conntout;
	waitp.tv_usec = 0;
	error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);

	if (e->call_prev)
		e->call_prev->call_next = e->call_next;
	else
		clnt_pending = e->call_next;
	if (e->call_next)
		e->call_next->call_prev = e->call_prev;
	mutex_exit(&clnt_pending_lock);

	/* get reply message */
	mp = e->call_reply;
	e->call_reply = NULL;

	if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) {

		DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name,
		    int, e->call_status, int, error, mblk_t *, mp);

		if (mp)
			freemsg(mp);
		return (FALSE);
	}

	opt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
	opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset,
	    opt_ack->OPT_length, __TPI_ALIGN_SIZE);

	if (!opt_res) {
		DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name,
		    int, opt_ack->OPT_offset, int, opt_ack->OPT_length);
		freemsg(mp);
		return (FALSE);
	}
	*val = *(int *)&opt_res[1];

	DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val);

	freemsg(mp);
	return (TRUE);
}

/*
 * Called by connmgr_connect to set an option on the new stream.
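 *
 * The request message built here has the standard TPI layout: a
 * T_optmgmt_req header immediately followed by one opthdr and a single
 * int of option data (a summary of the code below, not additional
 * logic):
 *
 *	+--------------------+-------------------+-----------+
 *	| T_optmgmt_req      | opthdr            | int value |
 *	+--------------------+-------------------+-----------+
 *	  PRIM_type =          level, name,        e.g. 1 for
 *	  T_SVR4_OPTMGMT_REQ   len = sizeof(int)   TCP_NODELAY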
 */
static bool_t
connmgr_setopt_int(queue_t *wq, int level, int name, int val,
    calllist_t *e, cred_t *cr)
{
	mblk_t *mp;
	struct opthdr *opt;
	struct T_optmgmt_req *tor;
	struct timeval waitp;
	int error;

	mp = allocb_cred(sizeof (struct T_optmgmt_req) +
	    sizeof (struct opthdr) + sizeof (int), cr, NOPID);
	if (mp == NULL) {
		RPCLOG0(1, "connmgr_setopt: cannot alloc mp for option "
		    "request\n");
		return (FALSE);
	}

	mp->b_datap->db_type = M_PROTO;
	tor = (struct T_optmgmt_req *)(mp->b_rptr);
	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
	tor->MGMT_flags = T_NEGOTIATE;
	tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
	tor->OPT_offset = sizeof (struct T_optmgmt_req);

	opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
	opt->level = level;
	opt->name = name;
	opt->len = sizeof (int);
	*(int *)((char *)opt + sizeof (*opt)) = val;
	mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
	    sizeof (int);

	/*
	 * We will use this connection regardless
	 * of whether or not the option is settable.
	 */
	if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
		DTRACE_PROBE(krpc__e__connmgr__setopt__cantsend);
		freemsg(mp);
		return (FALSE);
	}

	mutex_enter(&clnt_pending_lock);

	waitp.tv_sec = clnt_cots_min_conntout;
	waitp.tv_usec = 0;
	error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);

	if (e->call_prev)
		e->call_prev->call_next = e->call_next;
	else
		clnt_pending = e->call_next;
	if (e->call_next)
		e->call_next->call_prev = e->call_prev;
	mutex_exit(&clnt_pending_lock);

	if (e->call_reply != NULL) {
		freemsg(e->call_reply);
		e->call_reply = NULL;
	}

	if (e->call_status != RPC_SUCCESS || error != 0) {
		RPCLOG(1, "connmgr_setopt: can't set option: %d\n", name);
		return (FALSE);
	}
	RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name);
	return (TRUE);
}

static bool_t
connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
{
	return (connmgr_setopt_int(wq, level, name, 1, e, cr));
}

#ifdef DEBUG

/*
 * This is a knob to let us force code coverage in the allocation
 * failure case.
 */
static int connmgr_failsnd;
#define	CONN_SND_ALLOC(Size, Pri)	\
	((connmgr_failsnd-- > 0) ? NULL : allocb(Size, Pri))

#else

#define	CONN_SND_ALLOC(Size, Pri)	allocb(Size, Pri)

#endif

/*
 * Sends an orderly release on the specified queue.
 * Entered with connmgr_lock held;
 * exited with it released.
 */
static void
connmgr_sndrel(struct cm_xprt *cm_entry)
{
	struct T_ordrel_req *torr;
	mblk_t *mp;
	queue_t *q = cm_entry->x_wq;

	ASSERT(MUTEX_HELD(&connmgr_lock));
	mp = CONN_SND_ALLOC(sizeof (struct T_ordrel_req), BPRI_LO);
	if (mp == NULL) {
		cm_entry->x_needrel = TRUE;
		mutex_exit(&connmgr_lock);
		RPCLOG(1, "connmgr_sndrel: cannot alloc mp for sending ordrel "
		    "to queue %p\n", (void *)q);
		return;
	}
	mutex_exit(&connmgr_lock);

	mp->b_datap->db_type = M_PROTO;
	torr = (struct T_ordrel_req *)(mp->b_rptr);
	torr->PRIM_type = T_ORDREL_REQ;
	mp->b_wptr = mp->b_rptr + sizeof (struct T_ordrel_req);

	RPCLOG(8, "connmgr_sndrel: sending ordrel to queue %p\n", (void *)q);
	put(q, mp);
}

/*
 * Sends a disconnect on the specified queue.
 * Entered with connmgr_lock held; exited with it released.
 */
static void
connmgr_snddis(struct cm_xprt *cm_entry)
{
	struct T_discon_req *tdis;
	mblk_t *mp;
	queue_t *q = cm_entry->x_wq;

	ASSERT(MUTEX_HELD(&connmgr_lock));
	mp = CONN_SND_ALLOC(sizeof (*tdis), BPRI_LO);
	if (mp == NULL) {
		cm_entry->x_needdis = TRUE;
		mutex_exit(&connmgr_lock);
		RPCLOG(1, "connmgr_snddis: cannot alloc mp for sending discon "
		    "to queue %p\n", (void *)q);
		return;
	}
	mutex_exit(&connmgr_lock);

	mp->b_datap->db_type = M_PROTO;
	tdis = (struct T_discon_req *)mp->b_rptr;
	tdis->PRIM_type = T_DISCON_REQ;
	mp->b_wptr = mp->b_rptr + sizeof (*tdis);

	RPCLOG(8, "connmgr_snddis: sending discon to queue %p\n", (void *)q);
	put(q, mp);
}

/*
 * Sets up the entry for receiving replies, and calls rpcmod's write put proc
 * (through put) to send the call.
 */
static int
clnt_dispatch_send(queue_t *q, mblk_t *mp, calllist_t *e, uint_t xid,
    uint_t queue_flag)
{
	ASSERT(e != NULL);

	e->call_status = RPC_TIMEDOUT;	/* optimistic, eh? */
	e->call_reason = 0;
	e->call_wq = q;
	e->call_xid = xid;
	e->call_notified = FALSE;

	if (!canput(q)) {
		e->call_status = RPC_CANTSEND;
		e->call_reason = ENOBUFS;
		return (RPC_CANTSEND);
	}

	/*
	 * If queue_flag is set then the calllist_t is already on the hash
	 * queue. In this case just send the message and return.
	 */
	if (queue_flag) {
		put(q, mp);
		return (RPC_SUCCESS);
	}

	/*
	 * Set up calls for RPC requests (with XID != 0) on the hash
	 * queue for fast lookups and place other calls (i.e.
	 * connection management) on the linked list.
	 */
	if (xid != 0) {
		RPCLOG(64, "clnt_dispatch_send: putting xid 0x%x on "
		    "dispatch list\n", xid);
		e->call_hash = call_hash(xid, clnt_cots_hash_size);
		e->call_bucket = &cots_call_ht[e->call_hash];
		call_table_enter(e);
	} else {
		mutex_enter(&clnt_pending_lock);
		if (clnt_pending)
			clnt_pending->call_prev = e;
		e->call_next = clnt_pending;
		e->call_prev = NULL;
		clnt_pending = e;
		mutex_exit(&clnt_pending_lock);
	}

	put(q, mp);
	return (RPC_SUCCESS);
}

/*
 * Called by rpcmod to notify a client with a clnt_pending call that its reply
 * has arrived. If we can't find a client waiting for this reply, we log
 * the error and return.
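 *
 * Matching is purely by XID: the first 32 bits of the reply select a
 * bucket in cots_call_ht, and that bucket is searched for a calllist_t
 * with the same call_xid. In outline (a summary of the code below, not
 * additional logic):
 *
 *	xid  = first 32 bits of the reply mblk
 *	hash = call_hash(xid, clnt_cots_hash_size)
 *	e    = lookup in cots_call_ht[hash]  (bucket lock held)
 *	if found: attach mp to e->call_reply and cv_signal the waiter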
 */
bool_t
clnt_dispatch_notify(mblk_t *mp, zoneid_t zoneid)
{
	calllist_t *e = NULL;
	call_table_t *chtp;
	uint32_t xid;
	uint_t hash;

	if ((IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) &&
	    (mp->b_wptr - mp->b_rptr) >= sizeof (xid))
		xid = *((uint32_t *)mp->b_rptr);
	else {
		int i = 0;
		unsigned char *p = (unsigned char *)&xid;
		unsigned char *rptr;
		mblk_t *tmp = mp;

		/*
		 * Copy the xid, byte by byte, out of the mblk chain.
		 */
		while (tmp) {
			rptr = tmp->b_rptr;
			while (rptr < tmp->b_wptr) {
				*p++ = *rptr++;
				if (++i >= sizeof (xid))
					goto done_xid_copy;
			}
			tmp = tmp->b_cont;
		}

		/*
		 * If we got here, we ran out of mblk space before the
		 * xid could be copied.
		 */
		ASSERT(tmp == NULL && i < sizeof (xid));

		RPCLOG0(1,
		    "clnt_dispatch_notify: message less than size of xid\n");
		return (FALSE);

	}
done_xid_copy:

	hash = call_hash(xid, clnt_cots_hash_size);
	chtp = &cots_call_ht[hash];
	/* call_table_find returns with the hash bucket locked */
	call_table_find(chtp, xid, e);

	if (e != NULL) {
		/*
		 * Found thread waiting for this reply
		 */
		mutex_enter(&e->call_lock);

		/*
		 * verify that the reply is coming in on
		 * the same zone that it was sent from.
		 */
		if (e->call_zoneid != zoneid) {
			mutex_exit(&e->call_lock);
			mutex_exit(&chtp->ct_lock);
			RPCLOG0(1, "clnt_dispatch_notify: incorrect zoneid\n");
			return (FALSE);
		}

		if (e->call_reply)
			/*
			 * This can happen under the following scenario:
			 * clnt_cots_kcallit() times out on the response,
			 * rfscall() repeats the CLNT_CALL() with
			 * the same xid, clnt_cots_kcallit() sends the retry,
			 * thereby putting the clnt handle on the pending list,
			 * the first response arrives, signalling the thread
			 * in clnt_cots_kcallit(). Before that thread is
			 * dispatched, the second response arrives as well,
			 * and clnt_dispatch_notify still finds the handle on
			 * the pending list, with call_reply set. So free the
			 * old reply now.
			 *
			 * It is also possible for a response intended for
			 * an RPC call with a different xid to reside here.
			 * This can happen if the thread that owned this
			 * client handle prior to the current owner bailed
			 * out and left its call record on the dispatch
			 * queue. A window exists where the response can
			 * arrive before the current owner dispatches its
			 * RPC call.
			 *
			 * In any case, this is the very last point where we
			 * can safely check the call_reply field before
			 * placing the new response there.
			 */
			freemsg(e->call_reply);
		e->call_reply = mp;
		e->call_status = RPC_SUCCESS;
		e->call_notified = TRUE;
		cv_signal(&e->call_cv);
		mutex_exit(&e->call_lock);
		mutex_exit(&chtp->ct_lock);
		return (TRUE);
	} else {
		zone_t *zone;
		struct rpcstat *rpcstat;

		mutex_exit(&chtp->ct_lock);
		RPCLOG(65, "clnt_dispatch_notify: no caller for reply 0x%x\n",
		    xid);
		/*
		 * This is unfortunate, but we need to lookup the zone so we
		 * can increment its "rcbadxids" counter.
		 */
		zone = zone_find_by_id(zoneid);
		if (zone == NULL) {
			/*
			 * The zone went away...
			 */
			return (FALSE);
		}
		rpcstat = zone_getspecific(rpcstat_zone_key, zone);
		if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
			/*
			 * Not interested
			 */
			zone_rele(zone);
			return (FALSE);
		}
		COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcbadxids);
		zone_rele(zone);
	}
	return (FALSE);
}

/*
 * Called by rpcmod when a non-data indication arrives. The ones in which we
 * are interested are connection indications and options acks. We dispatch
 * based on the queue the indication came in on. If we are not interested in
 * what came in, we return false to rpcmod, who will then pass it upstream.
 */
bool_t
clnt_dispatch_notifyconn(queue_t *q, mblk_t *mp)
{
	calllist_t *e;
	int type;

	ASSERT((q->q_flag & QREADR) == 0);

	type = ((union T_primitives *)mp->b_rptr)->type;
	RPCLOG(8, "clnt_dispatch_notifyconn: prim type: [%s]\n",
	    rpc_tpiprim2name(type));
	mutex_enter(&clnt_pending_lock);
	for (e = clnt_pending; /* NO CONDITION */; e = e->call_next) {
		if (e == NULL) {
			mutex_exit(&clnt_pending_lock);
			RPCLOG(1, "clnt_dispatch_notifyconn: no one waiting "
			    "for connection on queue 0x%p\n", (void *)q);
			return (FALSE);
		}
		if (e->call_wq == q)
			break;
	}

	switch (type) {
	case T_CONN_CON:
		/*
		 * The transport is now connected, send a T_INFO_REQ to get
		 * the tidu size.
		 */
		mutex_exit(&clnt_pending_lock);
		ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >=
		    sizeof (struct T_info_req));
		mp->b_rptr = mp->b_datap->db_base;
		((union T_primitives *)mp->b_rptr)->type = T_INFO_REQ;
		mp->b_wptr = mp->b_rptr + sizeof (struct T_info_req);
		mp->b_datap->db_type = M_PCPROTO;
		put(q, mp);
		return (TRUE);
	case T_INFO_ACK:
	case T_OPTMGMT_ACK:
		e->call_status = RPC_SUCCESS;
		e->call_reply = mp;
		e->call_notified = TRUE;
		cv_signal(&e->call_cv);
		break;
	case T_ERROR_ACK:
		e->call_status = RPC_CANTCONNECT;
		e->call_reply = mp;
		e->call_notified = TRUE;
		cv_signal(&e->call_cv);
		break;
	case T_OK_ACK:
		/*
		 * Great, but we are really waiting for a T_CONN_CON
		 */
		freemsg(mp);
		break;
	default:
		mutex_exit(&clnt_pending_lock);
		RPCLOG(1, "clnt_dispatch_notifyconn: bad type %d\n", type);
		return (FALSE);
	}

	mutex_exit(&clnt_pending_lock);
	return (TRUE);
}

/*
 * Called by rpcmod when the transport is (or should be) going away. Informs
 * all callers waiting for replies and marks the entry in the connection
 * manager's list as unconnected, and either closing (close handshake in
 * progress) or dead.
 */
void
clnt_dispatch_notifyall(queue_t *q, int32_t msg_type, int32_t reason)
{
	calllist_t *e;
	call_table_t *ctp;
	struct cm_xprt *cm_entry;
	int have_connmgr_lock;
	int i;

	ASSERT((q->q_flag & QREADR) == 0);

	RPCLOG(1, "clnt_dispatch_notifyall on queue %p", (void *)q);
	RPCLOG(1, " received a notification prim type [%s]",
	    rpc_tpiprim2name(msg_type));
	RPCLOG(1, " and reason %d\n", reason);

	/*
	 * Find the transport entry in the connection manager's list, close
	 * the transport and delete the entry. In the case where rpcmod's
	 * idle timer goes off, it sends us a T_ORDREL_REQ, indicating we
	 * should gracefully close the connection.
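	 *
	 * Roughly, the primitives handled below map to connection state as
	 * follows (a summary of the switch statement, not additional
	 * logic):
	 *
	 *	T_ORDREL_REQ	rpcmod's idle timer fired; close the
	 *			connection gracefully
	 *	T_ORDREL_IND	the peer started an orderly release;
	 *			answer with T_DISCON_REQ to avoid
	 *			TIMEWAIT (see the comment at the top
	 *			of this file)
	 *	T_ERROR_ACK,
	 *	T_OK_ACK	ack for an earlier T_DISCON_REQ; wake
	 *			the thread in connmgr_dis_and_wait()
	 *	T_DISCON_REQ,
	 *	T_DISCON_IND	the transport is disconnecting; mark
	 *			the entry disconnected or dead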
	 */
	have_connmgr_lock = 1;
	mutex_enter(&connmgr_lock);
	for (cm_entry = cm_hd; cm_entry; cm_entry = cm_entry->x_next) {
		ASSERT(cm_entry != cm_entry->x_next);
		if (cm_entry->x_wq == q) {
			ASSERT(MUTEX_HELD(&connmgr_lock));
			ASSERT(have_connmgr_lock == 1);
			switch (msg_type) {
			case T_ORDREL_REQ:

				if (cm_entry->x_dead) {
					RPCLOG(1, "idle timeout on dead "
					    "connection: %p\n",
					    (void *)cm_entry);
					if (clnt_stop_idle != NULL)
						(*clnt_stop_idle)(q);
					break;
				}

				/*
				 * Only mark the connection as dead if it is
				 * connected and idle.
				 * An unconnected connection has probably
				 * gone idle because the server is down,
				 * and when it comes back up there will be
				 * retries that need to use that connection.
				 */
				if (cm_entry->x_connected ||
				    cm_entry->x_doomed) {
					if (cm_entry->x_ordrel) {
						if (cm_entry->x_closing ==
						    TRUE) {
							/*
							 * The connection is
							 * obviously wedged due
							 * to a bug or problem
							 * with the transport.
							 * Mark it as dead.
							 * Otherwise we can
							 * leak connections.
							 */
							cm_entry->x_dead = TRUE;
							mutex_exit(
							    &connmgr_lock);
							have_connmgr_lock = 0;
							if (clnt_stop_idle !=
							    NULL)
								(*clnt_stop_idle)(q);
							break;
						}
						cm_entry->x_closing = TRUE;
						connmgr_sndrel(cm_entry);
						have_connmgr_lock = 0;
					} else {
						cm_entry->x_dead = TRUE;
						mutex_exit(&connmgr_lock);
						have_connmgr_lock = 0;
						if (clnt_stop_idle != NULL)
							(*clnt_stop_idle)(q);
					}
				} else {
					/*
					 * We don't mark the connection
					 * as dead, but we turn off the
					 * idle timer.
					 */
					mutex_exit(&connmgr_lock);
					have_connmgr_lock = 0;
					if (clnt_stop_idle != NULL)
						(*clnt_stop_idle)(q);
					RPCLOG(1, "clnt_dispatch_notifyall:"
					    " ignoring timeout from rpcmod"
					    " (q %p) because we are not "
					    " connected\n", (void *)q);
				}
				break;
			case T_ORDREL_IND:
				/*
				 * If this entry is marked closing, then we are
				 * completing a close handshake, and the
				 * connection is dead. Otherwise, the server is
				 * trying to close. Since the server will not
				 * be sending any more RPC replies, we abort
				 * the connection, including flushing
				 * any RPC requests that are in-transit.
				 * In either case, mark the entry as dead so
				 * that it can be closed by the connection
				 * manager's garbage collector.
				 */
				cm_entry->x_dead = TRUE;
				if (cm_entry->x_closing) {
					mutex_exit(&connmgr_lock);
					have_connmgr_lock = 0;
					if (clnt_stop_idle != NULL)
						(*clnt_stop_idle)(q);
				} else {
					/*
					 * if we're getting a disconnect
					 * before we've finished our
					 * connect attempt, mark it for
					 * later processing
					 */
					if (cm_entry->x_thread)
						cm_entry->x_early_disc = TRUE;
					else
						cm_entry->x_connected = FALSE;
					cm_entry->x_waitdis = TRUE;
					connmgr_snddis(cm_entry);
					have_connmgr_lock = 0;
				}
				break;

			case T_ERROR_ACK:
			case T_OK_ACK:
				cm_entry->x_waitdis = FALSE;
				cv_signal(&cm_entry->x_dis_cv);
				mutex_exit(&connmgr_lock);
				return;

			case T_DISCON_REQ:
				if (cm_entry->x_thread)
					cm_entry->x_early_disc = TRUE;
				else
					cm_entry->x_connected = FALSE;
				cm_entry->x_waitdis = TRUE;

				connmgr_snddis(cm_entry);
				have_connmgr_lock = 0;
				break;

			case T_DISCON_IND:
			default:
				/*
				 * if we're getting a disconnect before
				 * we've finished our connect attempt,
				 * mark it for later processing
				 */
				if (cm_entry->x_closing) {
					cm_entry->x_dead = TRUE;
					mutex_exit(&connmgr_lock);
					have_connmgr_lock = 0;
					if (clnt_stop_idle != NULL)
						(*clnt_stop_idle)(q);
				} else {
					if (cm_entry->x_thread) {
						cm_entry->x_early_disc = TRUE;
					} else {
						cm_entry->x_dead = TRUE;
						cm_entry->x_connected = FALSE;
					}
				}
				break;
			}
			break;
		}
	}

	if (have_connmgr_lock)
		mutex_exit(&connmgr_lock);

	if (msg_type == T_ERROR_ACK || msg_type == T_OK_ACK) {
		RPCLOG(1, "clnt_dispatch_notifyall: (wq %p) could not find "
		    "connmgr entry for discon ack\n", (void *)q);
		return;
	}

	/*
	 * Then kick all the clnt_pending calls out of their wait. There
	 * should be no clnt_pending calls in the case of rpcmod's idle
	 * timer firing.
	 */
	for (i = 0; i < clnt_cots_hash_size; i++) {
		ctp = &cots_call_ht[i];
		mutex_enter(&ctp->ct_lock);
		for (e = ctp->ct_call_next;
		    e != (calllist_t *)ctp;
		    e = e->call_next) {
			if (e->call_wq == q && e->call_notified == FALSE) {
				RPCLOG(1,
				    "clnt_dispatch_notifyall for queue %p ",
				    (void *)q);
				RPCLOG(1, "aborting clnt_pending call %p\n",
				    (void *)e);

				if (msg_type == T_DISCON_IND)
					e->call_reason = reason;
				e->call_notified = TRUE;
				e->call_status = RPC_XPRTFAILED;
				cv_signal(&e->call_cv);
			}
		}
		mutex_exit(&ctp->ct_lock);
	}

	mutex_enter(&clnt_pending_lock);
	for (e = clnt_pending; e; e = e->call_next) {
		/*
		 * Only signal those RPC handles that haven't been
		 * signalled yet. Otherwise we can get a bogus call_reason.
		 * This can happen if thread A is making a call over a
		 * connection. If the server is killed, it will cause a
		 * reset, and reason will default to EIO as a result of
		 * a T_ORDREL_IND. Thread B then attempts to recreate
		 * the connection but gets a T_DISCON_IND. If we set the
		 * call_reason code for all threads, then if thread A
		 * hasn't been dispatched yet, it will get the wrong
		 * reason. The bogus call_reason can make it harder to
		 * discriminate between calls that fail because the
		 * connection attempt failed versus those where the call
		 * may have been executed on the server.
		 */
		if (e->call_wq == q && e->call_notified == FALSE) {
			RPCLOG(1, "clnt_dispatch_notifyall for queue %p ",
			    (void *)q);
			RPCLOG(1, " aborting clnt_pending call %p\n",
			    (void *)e);

			if (msg_type == T_DISCON_IND)
				e->call_reason = reason;
			e->call_notified = TRUE;
			/*
			 * Let the caller timeout, else it will retry
			 * immediately.
			 */
			e->call_status = RPC_XPRTFAILED;

			/*
			 * We used to just signal those threads
			 * waiting for a connection (call_xid = 0).
			 * That meant that threads waiting for a response
			 * waited till their timeout expired. This
			 * could be a long time if they've specified a
			 * maximum timeout (2^31 - 1), so we signal
			 * all threads now.
			 */
			cv_signal(&e->call_cv);
		}
	}
	mutex_exit(&clnt_pending_lock);
}


/*ARGSUSED*/
/*
 * After resuming a system that's been suspended for longer than the
 * NFS server's idle timeout (svc_idle_timeout for Solaris 2), rfscall()
 * generates "NFS server X not responding" and "NFS server X ok" messages;
 * here we reset inet connections to cause a re-connect and avoid those
 * NFS messages. See 4045054.
 */
boolean_t
connmgr_cpr_reset(void *arg, int code)
{
	struct cm_xprt *cxp;

	if (code == CB_CODE_CPR_CHKPT)
		return (B_TRUE);

	if (mutex_tryenter(&connmgr_lock) == 0)
		return (B_FALSE);
	for (cxp = cm_hd; cxp; cxp = cxp->x_next) {
		if ((cxp->x_family == AF_INET || cxp->x_family == AF_INET6) &&
		    cxp->x_connected == TRUE) {
			if (cxp->x_thread)
				cxp->x_early_disc = TRUE;
			else
				cxp->x_connected = FALSE;
			cxp->x_needdis = TRUE;
		}
	}
	mutex_exit(&connmgr_lock);
	return (B_TRUE);
}

void
clnt_cots_stats_init(zoneid_t zoneid, struct rpc_cots_client **statsp)
{
	*statsp = (struct rpc_cots_client *)rpcstat_zone_init_common(zoneid,
	    "unix", "rpc_cots_client", (const kstat_named_t *)&cots_rcstat_tmpl,
	    sizeof (cots_rcstat_tmpl));
}

void
clnt_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_client **statsp)
{
	rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_client");
	kmem_free(*statsp, sizeof (cots_rcstat_tmpl));
}

void
clnt_cots_init(void)
{
	mutex_init(&connmgr_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&clnt_pending_lock, NULL, MUTEX_DEFAULT, NULL);

	if (clnt_cots_hash_size < DEFAULT_MIN_HASH_SIZE)
		clnt_cots_hash_size = DEFAULT_MIN_HASH_SIZE;

	cots_call_ht = call_table_init(clnt_cots_hash_size);
	zone_key_create(&zone_cots_key, NULL, NULL, clnt_zone_destroy);
}

void
clnt_cots_fini(void)
{
	(void) zone_key_delete(zone_cots_key);
}

/*
 * Wait for a TPI ack; returns success only if the expected ack is received
 * within the timeout period.
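 *
 * The expected calling pattern in this file (an illustrative sketch;
 * the expected primitive varies per caller):
 *
 *	clnt_dispatch_send(wq, mp, e, 0, 0);   - puts e on clnt_pending
 *	mutex_enter(&clnt_pending_lock);
 *	error = waitforack(e, expected_prim, waitp, nosignal);
 *	... unlink e from clnt_pending ...
 *	mutex_exit(&clnt_pending_lock);
 *
 * The reply mblk, if any, is left in e->call_reply for the caller to
 * consume or free.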
 */

static int
waitforack(calllist_t *e, t_scalar_t ack_prim, const struct timeval *waitp,
    bool_t nosignal)
{
	union T_primitives *tpr;
	clock_t timout;
	int cv_stat = 1;

	ASSERT(MUTEX_HELD(&clnt_pending_lock));
	while (e->call_reply == NULL) {
		if (waitp != NULL) {
			timout = waitp->tv_sec * drv_usectohz(MICROSEC) +
			    drv_usectohz(waitp->tv_usec);
			if (nosignal)
				cv_stat = cv_reltimedwait(&e->call_cv,
				    &clnt_pending_lock, timout, TR_CLOCK_TICK);
			else
				cv_stat = cv_reltimedwait_sig(&e->call_cv,
				    &clnt_pending_lock, timout, TR_CLOCK_TICK);
		} else {
			if (nosignal)
				cv_wait(&e->call_cv, &clnt_pending_lock);
			else
				cv_stat = cv_wait_sig(&e->call_cv,
				    &clnt_pending_lock);
		}
		if (cv_stat == -1)
			return (ETIME);
		if (cv_stat == 0)
			return (EINTR);
		/*
		 * if we received an error from the server and we know a reply
		 * is not going to be sent, do not wait for the full timeout;
		 * return now.
		 */
		if (e->call_status == RPC_XPRTFAILED)
			return (e->call_reason);
	}
	tpr = (union T_primitives *)e->call_reply->b_rptr;
	if (tpr->type == ack_prim)
		return (0);	/* Success */

	if (tpr->type == T_ERROR_ACK) {
		if (tpr->error_ack.TLI_error == TSYSERR)
			return (tpr->error_ack.UNIX_error);
		else
			return (t_tlitosyserr(tpr->error_ack.TLI_error));
	}

	return (EPROTO);	/* unknown or unexpected primitive */
}