1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2016 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /*
33 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
34 * All Rights Reserved
35 */
36
37 /*
38 * Portions of this source code were derived from Berkeley 4.3 BSD
39 * under license from the Regents of the University of California.
40 */
41
42
43 /*
44 * Implements a kernel based, client side RPC over Connection Oriented
45 * Transports (COTS).
46 */
47
48 /*
49 * Much of this file has been re-written to let NFS work better over slow
50 * transports. A description follows.
51 *
 * One of the annoying things about kRPC/COTS is that it will temporarily
 * create more than one connection between a client and server. This
 * happens because when a connection is made, the end-point's entry in the
 * linked list of connections (headed by cm_hd) is removed so that other
 * threads don't mess with it. We went ahead and bit the bullet by keeping
 * the endpoint on the connection list and introducing state bits,
 * condition variables, etc. to the connection entry data structure (struct
 * cm_xprt).
60 *
 * Here is a summary of the changes to cm_xprt:
62 *
63 * x_ctime is the timestamp of when the endpoint was last
64 * connected or disconnected. If an end-point is ever disconnected
65 * or re-connected, then any outstanding RPC request is presumed
66 * lost, telling clnt_cots_kcallit that it needs to re-send the
67 * request, not just wait for the original request's reply to
68 * arrive.
69 *
70 * x_thread flag which tells us if a thread is doing a connection attempt.
71 *
72 * x_waitdis flag which tells us we are waiting a disconnect ACK.
73 *
74 * x_needdis flag which tells us we need to send a T_DISCONN_REQ
75 * to kill the connection.
76 *
77 * x_needrel flag which tells us we need to send a T_ORDREL_REQ to
78 * gracefully close the connection.
79 *
80 * #defined bitmasks for the all the b_* bits so that more
81 * efficient (and at times less clumsy) masks can be used to
82 * manipulated state in cases where multiple bits have to
83 * set/cleared/checked in the same critical section.
84 *
85 * x_conn_cv and x_dis-_cv are new condition variables to let
86 * threads knows when the connection attempt is done, and to let
87 * the connecting thread know when the disconnect handshake is
88 * done.
89 *
90 * Added the CONN_HOLD() macro so that all reference holds have the same
91 * look and feel.
92 *
93 * In the private (cku_private) portion of the client handle,
94 *
95 * cku_flags replaces the cku_sent a boolean. cku_flags keeps
96 * track of whether a request as been sent, and whether the
97 * client's handles call record is on the dispatch list (so that
98 * the reply can be matched by XID to the right client handle).
99 * The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
100 * and still have the response find the right client handle so
101 * that the retry of CLNT_CALL() gets the result. Testing, found
102 * situations where if the timeout was increased, performance
103 * degraded. This was due to us hitting a window where the thread
104 * was back in rfscall() (probably printing server not responding)
105 * while the response came back but no place to put it.
106 *
107 * cku_ctime is just a cache of x_ctime. If they match,
108 * clnt_cots_kcallit() won't to send a retry (unless the maximum
109 * receive count limit as been reached). If the don't match, then
110 * we assume the request has been lost, and a retry of the request
111 * is needed.
112 *
113 * cku_recv_attempts counts the number of receive count attempts
114 * after one try is sent on the wire.
115 *
116 * Added the clnt_delay() routine so that interruptible and
117 * noninterruptible delays are possible.
118 *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really do bash
 * a server that may be booting and has not yet started nfsd.
123 *
 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3, with a tunable).
125 * Why don't we just wait forever (receive an infinite # of times)?
126 * Because the server may have rebooted. More insidious is that some
127 * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
128 * but it is a reality.
129 *
130 * The case of a server doing orderly release really messes up the
131 * client's recovery, especially if the server's TCP implementation is
 * buggy. It was found that the kRPC/COTS client was breaking some
133 * TPI rules, such as not waiting for the acknowledgement of a
134 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
135 * T_DISCON_REQ in clnt_dispatch_notifyall()).
136 *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish. If a server sends a
140 * T_ORDREL_IND, there is little point in an RPC client doing a
141 * T_ORDREL_REQ, because the RPC request isn't going to make it (the
142 * server is saying that it won't accept any more data). So kRPC was
143 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
144 * connection skips the TIMEWAIT state and goes straight to a bound state
145 * that kRPC can quickly switch to connected.
146 *
 * Code that issues TPI requests must use waitforack() to wait for the
148 * corresponding ack (assuming there is one) in any future modifications.
149 * This works around problems that may be introduced by breaking TPI rules
150 * (by submitting new calls before earlier requests have been acked) in the
151 * case of a signal or other early return. waitforack() depends on
152 * clnt_dispatch_notifyconn() to issue the wakeup when the ack
153 * arrives, so adding new TPI calls may require corresponding changes
154 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
155 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
156 * not to set it too low or TPI ACKS will be lost.
157 */
158
159 #include <sys/param.h>
160 #include <sys/types.h>
161 #include <sys/user.h>
162 #include <sys/systm.h>
163 #include <sys/sysmacros.h>
164 #include <sys/proc.h>
165 #include <sys/socket.h>
166 #include <sys/file.h>
167 #include <sys/stream.h>
168 #include <sys/strsubr.h>
169 #include <sys/stropts.h>
170 #include <sys/strsun.h>
171 #include <sys/timod.h>
172 #include <sys/tiuser.h>
173 #include <sys/tihdr.h>
174 #include <sys/t_kuser.h>
175 #include <sys/fcntl.h>
176 #include <sys/errno.h>
177 #include <sys/kmem.h>
178 #include <sys/debug.h>
180 #include <sys/kstat.h>
181 #include <sys/t_lock.h>
182 #include <sys/ddi.h>
183 #include <sys/cmn_err.h>
184 #include <sys/time.h>
185 #include <sys/isa_defs.h>
186 #include <sys/callb.h>
187 #include <sys/sunddi.h>
188 #include <sys/atomic.h>
189 #include <sys/sdt.h>
190
191 #include <netinet/in.h>
192 #include <netinet/tcp.h>
193
194 #include <rpc/types.h>
195 #include <rpc/xdr.h>
196 #include <rpc/auth.h>
197 #include <rpc/clnt.h>
198 #include <rpc/rpc_msg.h>
199
200 #define COTS_DEFAULT_ALLOCSIZE 2048
201
202 #define WIRE_HDR_SIZE 20 /* serialized call header, sans proc number */
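			/* (xid, direction, rpcvers, prog, vers: five 4-byte XDR words) */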
203 #define MSG_OFFSET 128 /* offset of call into the mblk */
204
205 const char *kinet_ntop6(uchar_t *, char *, size_t);
206
207 static int clnt_cots_ksettimers(CLIENT *, struct rpc_timers *,
208 struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
209 static enum clnt_stat clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
210 caddr_t, xdrproc_t, caddr_t, struct timeval);
211 static void clnt_cots_kabort(CLIENT *);
212 static void clnt_cots_kerror(CLIENT *, struct rpc_err *);
213 static bool_t clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t);
214 static void clnt_cots_kdestroy(CLIENT *);
215 static bool_t clnt_cots_kcontrol(CLIENT *, int, char *);
216
217
218 /* List of transports managed by the connection manager. */
219 struct cm_xprt {
220 TIUSER *x_tiptr; /* transport handle */
221 queue_t *x_wq; /* send queue */
222 clock_t x_time; /* last time we handed this xprt out */
223 clock_t x_ctime; /* time we went to CONNECTED */
224 int x_tidu_size; /* TIDU size of this transport */
225 union {
226 struct {
227 unsigned int
228 #ifdef _BIT_FIELDS_HTOL
			b_closing: 1,	/* we've sent an ord rel on this conn */
230 b_dead: 1, /* transport is closed or disconn */
231 b_doomed: 1, /* too many conns, let this go idle */
232 b_connected: 1, /* this connection is connected */
233
234 b_ordrel: 1, /* do an orderly release? */
235 b_thread: 1, /* thread doing connect */
236 b_waitdis: 1, /* waiting for disconnect ACK */
237 b_needdis: 1, /* need T_DISCON_REQ */
238
239 b_needrel: 1, /* need T_ORDREL_REQ */
240 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */
241 /* disconnect during connect */
242
243 b_pad: 22;
244
245 #endif
246
247 #ifdef _BIT_FIELDS_LTOH
248 b_pad: 22,
249
250 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */
251 /* disconnect during connect */
252 b_needrel: 1, /* need T_ORDREL_REQ */
253
254 b_needdis: 1, /* need T_DISCON_REQ */
255 b_waitdis: 1, /* waiting for disconnect ACK */
256 b_thread: 1, /* thread doing connect */
257 b_ordrel: 1, /* do an orderly release? */
258
259 b_connected: 1, /* this connection is connected */
260 b_doomed: 1, /* too many conns, let this go idle */
261 b_dead: 1, /* transport is closed or disconn */
			b_closing: 1;	/* we've sent an ord rel on this conn */
263 #endif
264 } bit; unsigned int word;
265
266 #define x_closing x_state.bit.b_closing
267 #define x_dead x_state.bit.b_dead
268 #define x_doomed x_state.bit.b_doomed
269 #define x_connected x_state.bit.b_connected
270
271 #define x_ordrel x_state.bit.b_ordrel
272 #define x_thread x_state.bit.b_thread
273 #define x_waitdis x_state.bit.b_waitdis
274 #define x_needdis x_state.bit.b_needdis
275
276 #define x_needrel x_state.bit.b_needrel
277 #define x_early_disc x_state.bit.b_early_disc
278
279 #define x_state_flags x_state.word
280
281 #define X_CLOSING 0x80000000
282 #define X_DEAD 0x40000000
283 #define X_DOOMED 0x20000000
284 #define X_CONNECTED 0x10000000
285
286 #define X_ORDREL 0x08000000
287 #define X_THREAD 0x04000000
288 #define X_WAITDIS 0x02000000
289 #define X_NEEDDIS 0x01000000
290
291 #define X_NEEDREL 0x00800000
292 #define X_EARLYDISC 0x00400000
293
294 #define X_BADSTATES (X_CLOSING | X_DEAD | X_DOOMED)
295
296 } x_state;
297 int x_ref; /* number of users of this xprt */
298 int x_family; /* address family of transport */
299 dev_t x_rdev; /* device number of transport */
300 struct cm_xprt *x_next;
301
302 struct netbuf x_server; /* destination address */
303 struct netbuf x_src; /* src address (for retries) */
304 kmutex_t x_lock; /* lock on this entry */
305 kcondvar_t x_cv; /* to signal when can be closed */
306 kcondvar_t x_conn_cv; /* to signal when connection attempt */
307 /* is complete */
308 kstat_t *x_ksp;
309
310 kcondvar_t x_dis_cv; /* to signal when disconnect attempt */
311 /* is complete */
312 zoneid_t x_zoneid; /* zone this xprt belongs to */
313 };
314
315 typedef struct cm_kstat_xprt {
316 kstat_named_t x_wq;
317 kstat_named_t x_server;
318 kstat_named_t x_family;
319 kstat_named_t x_rdev;
320 kstat_named_t x_time;
321 kstat_named_t x_state;
322 kstat_named_t x_ref;
323 kstat_named_t x_port;
324 } cm_kstat_xprt_t;
325
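/*
 * Template for the per-connection kstats; conn_kstat_update() below
 * fills in the values for each connection.
 */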
326 static cm_kstat_xprt_t cm_kstat_template = {
327 { "write_queue", KSTAT_DATA_UINT32 },
328 { "server", KSTAT_DATA_STRING },
329 { "addr_family", KSTAT_DATA_UINT32 },
330 { "device", KSTAT_DATA_UINT32 },
331 { "time_stamp", KSTAT_DATA_UINT32 },
332 { "status", KSTAT_DATA_UINT32 },
333 { "ref_count", KSTAT_DATA_INT32 },
334 { "port", KSTAT_DATA_UINT32 },
335 };
336
337 /*
338 * The inverse of this is connmgr_release().
339 */
340 #define CONN_HOLD(Cm_entry) {\
341 mutex_enter(&(Cm_entry)->x_lock); \
342 (Cm_entry)->x_ref++; \
343 mutex_exit(&(Cm_entry)->x_lock); \
344 }
345
346
347 /*
348 * Private data per rpc handle. This structure is allocated by
349 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy.
350 */
351 typedef struct cku_private_s {
352 CLIENT cku_client; /* client handle */
353 calllist_t cku_call; /* for dispatching calls */
354 struct rpc_err cku_err; /* error status */
355
356 struct netbuf cku_srcaddr; /* source address for retries */
357 int cku_addrfmly; /* for binding port */
358 struct netbuf cku_addr; /* remote address */
359 dev_t cku_device; /* device to use */
360 uint_t cku_flags;
361 #define CKU_ONQUEUE 0x1
362 #define CKU_SENT 0x2
363
364 bool_t cku_progress; /* for CLSET_PROGRESS */
365 uint32_t cku_xid; /* current XID */
366 clock_t cku_ctime; /* time stamp of when */
367 /* connection was created */
368 uint_t cku_recv_attempts;
369 XDR cku_outxdr; /* xdr routine for output */
370 XDR cku_inxdr; /* xdr routine for input */
371 char cku_rpchdr[WIRE_HDR_SIZE + 4];
372 /* pre-serialized rpc header */
373
374 uint_t cku_outbuflen; /* default output mblk length */
375 struct cred *cku_cred; /* credentials */
376 bool_t cku_nodelayonerr;
377 /* for CLSET_NODELAYONERR */
378 int cku_useresvport; /* Use reserved port */
379 struct rpc_cots_client *cku_stats; /* stats for zone */
380 } cku_private_t;
381
382 static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *,
383 const struct timeval *, struct netbuf *, int, struct netbuf *,
384 struct rpc_err *, bool_t, bool_t, cred_t *);
385
386 static bool_t connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
387 int, calllist_t *, int *, bool_t reconnect,
388 const struct timeval *, bool_t, cred_t *);
389
390 static void *connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
391 t_uscalar_t length, uint_t align_size);
392 static bool_t connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr);
393 static bool_t connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
394 calllist_t *e, cred_t *cr);
395 static bool_t connmgr_setopt_int(queue_t *wq, int level, int name, int val,
396 calllist_t *e, cred_t *cr);
397 static bool_t connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
398 static void connmgr_sndrel(struct cm_xprt *);
399 static void connmgr_snddis(struct cm_xprt *);
400 static void connmgr_close(struct cm_xprt *);
401 static void connmgr_release(struct cm_xprt *);
402 static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *,
403 cku_private_t *);
404
405 static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *,
406 struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t,
407 bool_t, int, cred_t *);
408
409 static void connmgr_cancelconn(struct cm_xprt *);
410 static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *,
411 bool_t);
412 static void connmgr_dis_and_wait(struct cm_xprt *);
413
414 static int clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t,
415 uint_t);
416
417 static int clnt_delay(clock_t, bool_t);
418
419 static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t);
420
421 /*
422 * Operations vector for TCP/IP based RPC
423 */
424 static struct clnt_ops tcp_ops = {
425 clnt_cots_kcallit, /* do rpc call */
426 clnt_cots_kabort, /* abort call */
427 clnt_cots_kerror, /* return error status */
428 clnt_cots_kfreeres, /* free results */
429 clnt_cots_kdestroy, /* destroy rpc handle */
430 clnt_cots_kcontrol, /* the ioctl() of rpc */
431 clnt_cots_ksettimers, /* set retry timers */
432 };
433
434 static int rpc_kstat_instance = 0; /* keeps the current instance */
435 /* number for the next kstat_create */
436
437 static struct cm_xprt *cm_hd = NULL;
438 static kmutex_t connmgr_lock; /* for connection mngr's list of transports */
439
440 extern kmutex_t clnt_max_msg_lock;
441
442 static calllist_t *clnt_pending = NULL;
443 extern kmutex_t clnt_pending_lock;
444
445 static int clnt_cots_hash_size = DEFAULT_HASH_SIZE;
446
447 static call_table_t *cots_call_ht;
448
449 static const struct rpc_cots_client {
450 kstat_named_t rccalls;
451 kstat_named_t rcbadcalls;
452 kstat_named_t rcbadxids;
453 kstat_named_t rctimeouts;
454 kstat_named_t rcnewcreds;
455 kstat_named_t rcbadverfs;
456 kstat_named_t rctimers;
457 kstat_named_t rccantconn;
458 kstat_named_t rcnomem;
459 kstat_named_t rcintrs;
460 } cots_rcstat_tmpl = {
461 { "calls", KSTAT_DATA_UINT64 },
462 { "badcalls", KSTAT_DATA_UINT64 },
463 { "badxids", KSTAT_DATA_UINT64 },
464 { "timeouts", KSTAT_DATA_UINT64 },
465 { "newcreds", KSTAT_DATA_UINT64 },
466 { "badverfs", KSTAT_DATA_UINT64 },
467 { "timers", KSTAT_DATA_UINT64 },
468 { "cantconn", KSTAT_DATA_UINT64 },
469 { "nomem", KSTAT_DATA_UINT64 },
470 { "interrupts", KSTAT_DATA_UINT64 }
471 };
472
473 #define COTSRCSTAT_INCR(p, x) \
474 atomic_inc_64(&(p)->x.value.ui64)
475
476 #define CLNT_MAX_CONNS 1 /* concurrent connections between clnt/srvr */
477 int clnt_max_conns = CLNT_MAX_CONNS;
478
479 #define CLNT_MIN_TIMEOUT 10 /* seconds to wait after we get a */
480 /* connection reset */
481 #define CLNT_MIN_CONNTIMEOUT 5 /* seconds to wait for a connection */
482
483
484 int clnt_cots_min_tout = CLNT_MIN_TIMEOUT;
485 int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT;
486
487 /*
488 * Limit the number of times we will attempt to receive a reply without
 * re-sending a request.
490 */
491 #define CLNT_MAXRECV_WITHOUT_RETRY 3
492 uint_t clnt_cots_maxrecv = CLNT_MAXRECV_WITHOUT_RETRY;
493
494 uint_t *clnt_max_msg_sizep;
495 void (*clnt_stop_idle)(queue_t *wq);
496
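/* Convert between the public CLIENT handle and the embedded private data. */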
497 #define ptoh(p) (&((p)->cku_client))
498 #define htop(h) ((cku_private_t *)((h)->cl_private))
499
500 /*
501 * Times to retry
502 */
503 #define REFRESHES 2 /* authentication refreshes */
504
505 /*
506 * The following is used to determine the global default behavior for
507 * COTS when binding to a local port.
508 *
509 * If the value is set to 1 the default will be to select a reserved
510 * (aka privileged) port, if the value is zero the default will be to
511 * use non-reserved ports. Users of kRPC may override this by using
512 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
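 *
 * For example (a sketch), a kRPC consumer could force the use of a
 * reserved port on a particular handle with:
 *
 *	int one = 1;
 *	(void) CLNT_CONTROL(client, CLSET_BINDRESVPORT, (char *)&one);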
513 */
514 int clnt_cots_do_bindresvport = 1;
515
516 static zone_key_t zone_cots_key;
517
518 /*
 * Default TCP send and receive buffer sizes for RPC connections.
520 * These values can be tuned by /etc/system.
521 */
522 int rpc_send_bufsz = 1024*1024;
523 int rpc_recv_bufsz = 1024*1024;
524 /*
525 * To use system-wide default for TCP send and receive buffer size,
526 * use /etc/system to set rpc_default_tcp_bufsz to 1:
527 *
528 * set rpcmod:rpc_default_tcp_bufsz=1
529 */
530 int rpc_default_tcp_bufsz = 0;
531
532 /*
533 * We need to do this after all kernel threads in the zone have exited.
534 */
535 /* ARGSUSED */
536 static void
537 clnt_zone_destroy(zoneid_t zoneid, void *unused)
538 {
539 struct cm_xprt **cmp;
540 struct cm_xprt *cm_entry;
541 struct cm_xprt *freelist = NULL;
542
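	/*
	 * Unlink this zone's entries while holding connmgr_lock, then
	 * close them after the lock is dropped, since connmgr_close()
	 * can block.
	 */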
543 mutex_enter(&connmgr_lock);
544 cmp = &cm_hd;
545 while ((cm_entry = *cmp) != NULL) {
546 if (cm_entry->x_zoneid == zoneid) {
547 *cmp = cm_entry->x_next;
548 cm_entry->x_next = freelist;
549 freelist = cm_entry;
550 } else {
551 cmp = &cm_entry->x_next;
552 }
553 }
554 mutex_exit(&connmgr_lock);
555 while ((cm_entry = freelist) != NULL) {
556 freelist = cm_entry->x_next;
557 connmgr_close(cm_entry);
558 }
559 }
560
561 int
562 clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog,
563 rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl)
564 {
565 CLIENT *h;
566 cku_private_t *p;
567 struct rpc_msg call_msg;
568 struct rpcstat *rpcstat;
569
570 RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog);
571
572 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
573 ASSERT(rpcstat != NULL);
574
	/* Allocate and initialize the client handle. */
576 p = kmem_zalloc(sizeof (*p), KM_SLEEP);
577
578 h = ptoh(p);
579
580 h->cl_private = (caddr_t)p;
581 h->cl_auth = authkern_create();
582 h->cl_ops = &tcp_ops;
583
584 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
585 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);
586
587 /*
588 * If the current sanity check size in rpcmod is smaller
589 * than the size needed, then increase the sanity check.
590 */
591 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
592 max_msgsize > *clnt_max_msg_sizep) {
593 mutex_enter(&clnt_max_msg_lock);
594 if (max_msgsize > *clnt_max_msg_sizep)
595 *clnt_max_msg_sizep = max_msgsize;
596 mutex_exit(&clnt_max_msg_lock);
597 }
598
599 p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE;
600
601 /* Preserialize the call message header */
602
603 call_msg.rm_xid = 0;
604 call_msg.rm_direction = CALL;
605 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
606 call_msg.rm_call.cb_prog = prog;
607 call_msg.rm_call.cb_vers = vers;
608
609 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE);
610
611 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
612 XDR_DESTROY(&p->cku_outxdr);
613 RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization "
614 "error\n");
615 auth_destroy(h->cl_auth);
616 kmem_free(p, sizeof (cku_private_t));
617 RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n");
618 return (EINVAL); /* XXX */
619 }
620 XDR_DESTROY(&p->cku_outxdr);
621
622 /*
623 * The zalloc initialized the fields below.
624 * p->cku_xid = 0;
625 * p->cku_flags = 0;
626 * p->cku_srcaddr.len = 0;
627 * p->cku_srcaddr.maxlen = 0;
628 */
629
630 p->cku_cred = cred;
631 p->cku_device = dev;
632 p->cku_addrfmly = family;
633 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
634 p->cku_addr.maxlen = addr->maxlen;
635 p->cku_addr.len = addr->len;
636 bcopy(addr->buf, p->cku_addr.buf, addr->len);
637 p->cku_stats = rpcstat->rpc_cots_client;
	p->cku_useresvport = -1; /* value has not been set */
639
640 *ncl = h;
641 return (0);
642 }
643
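/*
 * Abort is deliberately a no-op for COTS client handles.
 */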
644 /*ARGSUSED*/
645 static void
646 clnt_cots_kabort(CLIENT *h)
647 {
648 }
649
650 /*
651 * Return error info on this handle.
652 */
653 static void
654 clnt_cots_kerror(CLIENT *h, struct rpc_err *err)
655 {
656 /* LINTED pointer alignment */
657 cku_private_t *p = htop(h);
658
659 *err = p->cku_err;
660 }
661
662 /*ARGSUSED*/
663 static bool_t
664 clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
665 {
666 xdr_free(xdr_res, res_ptr);
667
668 return (TRUE);
669 }
670
671 static bool_t
672 clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg)
673 {
674 cku_private_t *p = htop(h);
675
676 switch (cmd) {
677 case CLSET_PROGRESS:
678 p->cku_progress = TRUE;
679 return (TRUE);
680
681 case CLSET_XID:
682 if (arg == NULL)
683 return (FALSE);
684
685 p->cku_xid = *((uint32_t *)arg);
686 return (TRUE);
687
688 case CLGET_XID:
689 if (arg == NULL)
690 return (FALSE);
691
692 *((uint32_t *)arg) = p->cku_xid;
693 return (TRUE);
694
695 case CLSET_NODELAYONERR:
696 if (arg == NULL)
697 return (FALSE);
698
699 if (*((bool_t *)arg) == TRUE) {
700 p->cku_nodelayonerr = TRUE;
701 return (TRUE);
702 }
703 if (*((bool_t *)arg) == FALSE) {
704 p->cku_nodelayonerr = FALSE;
705 return (TRUE);
706 }
707 return (FALSE);
708
709 case CLGET_NODELAYONERR:
710 if (arg == NULL)
711 return (FALSE);
712
713 *((bool_t *)arg) = p->cku_nodelayonerr;
714 return (TRUE);
715
716 case CLSET_BINDRESVPORT:
717 if (arg == NULL)
718 return (FALSE);
719
720 if (*(int *)arg != 1 && *(int *)arg != 0)
721 return (FALSE);
722
723 p->cku_useresvport = *(int *)arg;
724
725 return (TRUE);
726
727 case CLGET_BINDRESVPORT:
728 if (arg == NULL)
729 return (FALSE);
730
731 *(int *)arg = p->cku_useresvport;
732
733 return (TRUE);
734
735 default:
736 return (FALSE);
737 }
738 }
739
740 /*
741 * Destroy rpc handle. Frees the space used for output buffer,
742 * private data, and handle structure.
743 */
744 static void
745 clnt_cots_kdestroy(CLIENT *h)
746 {
747 /* LINTED pointer alignment */
748 cku_private_t *p = htop(h);
749 calllist_t *call = &p->cku_call;
750
751 RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h);
752 RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid);
753
754 if (p->cku_flags & CKU_ONQUEUE) {
755 RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x "
756 "from dispatch list\n", p->cku_xid);
757 call_table_remove(call);
758 }
759
760 if (call->call_reply)
761 freemsg(call->call_reply);
762 cv_destroy(&call->call_cv);
763 mutex_destroy(&call->call_lock);
764
765 kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
766 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
767 kmem_free(p, sizeof (*p));
768 }
769
770 static int clnt_cots_pulls;
771 #define RM_HDR_SIZE 4 /* record mark header size */
772
773 /*
774 * Call remote procedure.
775 */
776 static enum clnt_stat
777 clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
778 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
779 {
780 /* LINTED pointer alignment */
781 cku_private_t *p = htop(h);
782 calllist_t *call = &p->cku_call;
783 XDR *xdrs;
784 struct rpc_msg reply_msg;
785 mblk_t *mp;
786 #ifdef RPCDEBUG
787 clock_t time_sent;
788 #endif
789 struct netbuf *retryaddr;
790 struct cm_xprt *cm_entry = NULL;
791 queue_t *wq;
792 int len, waitsecs, max_waitsecs;
793 int mpsize;
794 int refreshes = REFRESHES;
795 int interrupted;
796 int tidu_size;
797 enum clnt_stat status;
798 struct timeval cwait;
799 bool_t delay_first = FALSE;
800 clock_t ticks, now;
801
802 RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum);
803 COTSRCSTAT_INCR(p->cku_stats, rccalls);
804
805 RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec);
806 RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec);
807 /*
808 * Bug ID 1240234:
809 * Look out for zero length timeouts. We don't want to
810 * wait zero seconds for a connection to be established.
811 */
812 if (wait.tv_sec < clnt_cots_min_conntout) {
813 cwait.tv_sec = clnt_cots_min_conntout;
814 cwait.tv_usec = 0;
815 RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,",
816 wait.tv_sec);
817 RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout);
818 } else {
819 cwait = wait;
820 }
821
822 call_again:
823 if (cm_entry) {
824 connmgr_release(cm_entry);
825 cm_entry = NULL;
826 }
827
828 mp = NULL;
829
830 /*
831 * If the call is not a retry, allocate a new xid and cache it
832 * for future retries.
833 * Bug ID 1246045:
834 * Treat call as a retry for purposes of binding the source
835 * port only if we actually attempted to send anything on
836 * the previous call.
837 */
838 if (p->cku_xid == 0) {
839 p->cku_xid = alloc_xid();
840 call->call_zoneid = rpc_zoneid();
841
842 /*
843 * We need to ASSERT here that our xid != 0 because this
844 * determines whether or not our call record gets placed on
845 * the hash table or the linked list. By design, we mandate
846 * that RPC calls over cots must have xid's != 0, so we can
847 * ensure proper management of the hash table.
848 */
849 ASSERT(p->cku_xid != 0);
850
851 retryaddr = NULL;
852 p->cku_flags &= ~CKU_SENT;
853
854 if (p->cku_flags & CKU_ONQUEUE) {
855 RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old"
856 " one (%p)\n", (void *)call);
857 call_table_remove(call);
858 p->cku_flags &= ~CKU_ONQUEUE;
859 RPCLOG(64, "clnt_cots_kcallit: removing call from "
860 "dispatch list because xid was zero (now 0x%x)\n",
861 p->cku_xid);
862 }
863
864 if (call->call_reply != NULL) {
865 freemsg(call->call_reply);
866 call->call_reply = NULL;
867 }
868 } else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) {
869 retryaddr = NULL;
870
871 } else if (p->cku_flags & CKU_SENT) {
872 retryaddr = &p->cku_srcaddr;
873
874 } else {
875 /*
876 * Bug ID 1246045: Nothing was sent, so set retryaddr to
877 * NULL and let connmgr_get() bind to any source port it
878 * can get.
879 */
880 retryaddr = NULL;
881 }
882
883 RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid);
884 RPCLOG(64, " flags = 0x%x\n", p->cku_flags);
885
886 p->cku_err.re_status = RPC_TIMEDOUT;
887 p->cku_err.re_errno = p->cku_err.re_terrno = 0;
888
889 cm_entry = connmgr_wrapget(retryaddr, &cwait, p);
890
891 if (cm_entry == NULL) {
892 RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n",
893 clnt_sperrno(p->cku_err.re_status));
894
895 /*
896 * The reasons why we fail to create a connection are
897 * varied. In most cases we don't want the caller to
		 * immediately retry. This could have one or more
		 * bad effects, including flooding the net with
		 * connect requests to ports with no listener, or a hard
		 * kernel loop due to all the "reserved" TCP ports being
		 * in use.
903 */
904 delay_first = TRUE;
905
906 /*
907 * Even if we end up returning EINTR, we still count a
908 * a "can't connect", because the connection manager
909 * might have been committed to waiting for or timing out on
910 * a connection.
911 */
912 COTSRCSTAT_INCR(p->cku_stats, rccantconn);
913 switch (p->cku_err.re_status) {
914 case RPC_INTR:
915 p->cku_err.re_errno = EINTR;
916
917 /*
918 * No need to delay because a UNIX signal(2)
919 * interrupted us. The caller likely won't
920 * retry the CLNT_CALL() and even if it does,
921 * we assume the caller knows what it is doing.
922 */
923 delay_first = FALSE;
924 break;
925
926 case RPC_TIMEDOUT:
927 p->cku_err.re_errno = ETIMEDOUT;
928
929 /*
			 * No need to delay because we already timed out
			 * on the connection request, and we assume that the
			 * transport timeout is longer than our minimum
			 * timeout, or at least not too much smaller.
934 */
935 delay_first = FALSE;
936 break;
937
938 case RPC_SYSTEMERROR:
939 case RPC_TLIERROR:
940 /*
941 * We want to delay here because a transient
942 * system error has a better chance of going away
943 * if we delay a bit. If it's not transient, then
			 * we don't want to end up in a hard kernel loop
945 * due to retries.
946 */
947 ASSERT(p->cku_err.re_errno != 0);
948 break;
949
950
951 case RPC_CANTCONNECT:
952 /*
953 * RPC_CANTCONNECT is set on T_ERROR_ACK which
954 * implies some error down in the TCP layer or
955 * below. If cku_nodelayonerror is set then we
956 * assume the caller knows not to try too hard.
957 */
958 RPCLOG0(8, "clnt_cots_kcallit: connection failed,");
959 RPCLOG0(8, " re_status=RPC_CANTCONNECT,");
960 RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno);
961 RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr);
962 if (p->cku_nodelayonerr == TRUE)
963 delay_first = FALSE;
964
965 p->cku_err.re_errno = EIO;
966
967 break;
968
969 case RPC_XPRTFAILED:
970 /*
971 * We want to delay here because we likely
972 * got a refused connection.
973 */
974 if (p->cku_err.re_errno == 0)
975 p->cku_err.re_errno = EIO;
976
977 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
978 p->cku_err.re_errno);
979
980 break;
981
982 default:
983 /*
984 * We delay here because it is better to err
985 * on the side of caution. If we got here then
986 * status could have been RPC_SUCCESS, but we
987 * know that we did not get a connection, so
988 * force the rpc status to RPC_CANTCONNECT.
989 */
990 p->cku_err.re_status = RPC_CANTCONNECT;
991 p->cku_err.re_errno = EIO;
992 break;
993 }
994 if (delay_first == TRUE)
995 ticks = clnt_cots_min_tout * drv_usectohz(1000000);
996 goto cots_done;
997 }
998
999 /*
1000 * If we've never sent any request on this connection (send count
	 * is zero, or the connection has been reset), cache the
	 * connection's create time and send a request (possibly a retry).
1003 */
1004 if ((p->cku_flags & CKU_SENT) == 0 ||
1005 p->cku_ctime != cm_entry->x_ctime) {
1006 p->cku_ctime = cm_entry->x_ctime;
1007
1008 } else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) &&
1009 (call->call_reply != NULL ||
1010 p->cku_recv_attempts < clnt_cots_maxrecv)) {
1011
1012 /*
1013 * If we've sent a request and our call is on the dispatch
1014 * queue and we haven't made too many receive attempts, then
1015 * don't re-send, just receive.
1016 */
1017 p->cku_recv_attempts++;
1018 goto read_again;
1019 }
1020
1021 /*
1022 * Now we create the RPC request in a STREAMS message. We have to do
1023 * this after the call to connmgr_get so that we have the correct
1024 * TIDU size for the transport.
1025 */
1026 tidu_size = cm_entry->x_tidu_size;
1027 len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE);
1028
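	/*
	 * allocb() does not block, so on failure use strwaitbuf() to wait
	 * for buffers to become available; a non-zero return from
	 * strwaitbuf() means we give up and fail the call with ENOSR.
	 */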
1029 while ((mp = allocb(len, BPRI_MED)) == NULL) {
1030 if (strwaitbuf(len, BPRI_MED)) {
1031 p->cku_err.re_status = RPC_SYSTEMERROR;
1032 p->cku_err.re_errno = ENOSR;
1033 COTSRCSTAT_INCR(p->cku_stats, rcnomem);
1034 goto cots_done;
1035 }
1036 }
1037 xdrs = &p->cku_outxdr;
1038 xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size);
1039 mpsize = MBLKSIZE(mp);
1040 ASSERT(mpsize >= len);
1041 ASSERT(mp->b_rptr == mp->b_datap->db_base);
1042
1043 /*
	 * If the size of the mblk is not appreciably larger than what we
	 * asked for, then resize the mblk to exactly len bytes. The reason
	 * for this: suppose len is 1600 bytes, the tidu is 1460 bytes
1047 * (from TCP over ethernet), and the arguments to the RPC require
1048 * 2800 bytes. Ideally we want the protocol to render two
1049 * ~1400 byte segments over the wire. However if allocb() gives us a 2k
1050 * mblk, and we allocate a second mblk for the remainder, the protocol
1051 * module may generate 3 segments over the wire:
1052 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and
1053 * 892 for the third. If we "waste" 448 bytes in the first mblk,
1054 * the XDR encoding will generate two ~1400 byte mblks, and the
1055 * protocol module is more likely to produce properly sized segments.
1056 */
1057 if ((mpsize >> 1) <= len)
1058 mp->b_rptr += (mpsize - len);
1059
1060 /*
1061 * Adjust b_rptr to reserve space for the non-data protocol headers
1062 * any downstream modules might like to add, and for the
1063 * record marking header.
1064 */
1065 mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);
1066
1067 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
1068 /* Copy in the preserialized RPC header information. */
1069 bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE);
1070
1071 /* Use XDR_SETPOS() to set the b_wptr to past the RPC header. */
1072 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base +
1073 WIRE_HDR_SIZE));
1074
1075 ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE);
1076
1077 /* Serialize the procedure number and the arguments. */
1078 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
1079 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
1080 (!(*xdr_args)(xdrs, argsp))) {
1081 XDR_DESTROY(xdrs);
1082 p->cku_err.re_status = RPC_CANTENCODEARGS;
1083 p->cku_err.re_errno = EIO;
1084 goto cots_done;
1085 }
1086
1087 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid;
1088 } else {
1089 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE];
1090 IXDR_PUT_U_INT32(uproc, procnum);
1091
1092 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
1093
1094 /* Use XDR_SETPOS() to set the b_wptr. */
1095 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));
1096
1097 /* Serialize the procedure number and the arguments. */
1098 if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4,
1099 xdrs, xdr_args, argsp)) {
1100 XDR_DESTROY(xdrs);
1101 p->cku_err.re_status = RPC_CANTENCODEARGS;
1102 p->cku_err.re_errno = EIO;
1103 goto cots_done;
1104 }
1105 }
1106
1107 XDR_DESTROY(xdrs);
1108
1109 RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n",
1110 tidu_size);
1111
1112 wq = cm_entry->x_wq;
1113 waitsecs = 0;
1114
1115 dispatch_again:
1116 status = clnt_dispatch_send(wq, mp, call, p->cku_xid,
1117 (p->cku_flags & CKU_ONQUEUE));
1118
1119 if ((status == RPC_CANTSEND) && (call->call_reason == ENOBUFS)) {
1120 /*
1121 * QFULL condition, allow some time for queue to drain
		 * and try again. Give up after waiting for the full timeout
		 * specified for the call, or if the zone is going away.
1124 */
1125 max_waitsecs = wait.tv_sec ? wait.tv_sec : clnt_cots_min_tout;
1126 if ((waitsecs++ < max_waitsecs) &&
1127 !(zone_status_get(curproc->p_zone) >=
1128 ZONE_IS_SHUTTING_DOWN)) {
1129
1130 /* wait 1 sec for queue to drain */
1131 if (clnt_delay(drv_usectohz(1000000),
1132 h->cl_nosignal) == EINTR) {
1133 p->cku_err.re_errno = EINTR;
1134 p->cku_err.re_status = RPC_INTR;
1135
1136 goto cots_done;
1137 }
1138
1139 /* and try again */
1140 goto dispatch_again;
1141 }
1142 p->cku_err.re_status = status;
1143 p->cku_err.re_errno = call->call_reason;
1144 DTRACE_PROBE(krpc__e__clntcots__kcallit__cantsend);
1145
1146 goto cots_done;
1147 }
1148
1149 if (waitsecs) {
		/* adjust timeout to account for time waited to send */
1151 wait.tv_sec -= waitsecs;
1152 if (wait.tv_sec < 0) {
1153 /* pick up reply on next retry */
1154 wait.tv_sec = 0;
1155 }
1156 DTRACE_PROBE2(clnt_cots__sendwait, CLIENT *, h,
1157 int, waitsecs);
1158 }
1159
1160 RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n",
1161 (uint_t)p->cku_xid);
1162 p->cku_flags = (CKU_ONQUEUE|CKU_SENT);
1163 p->cku_recv_attempts = 1;
1164
1165 #ifdef RPCDEBUG
1166 time_sent = ddi_get_lbolt();
1167 #endif
1168
1169 /*
1170 * Wait for a reply or a timeout. If there is no error or timeout,
1171 * (both indicated by call_status), call->call_reply will contain
1172 * the RPC reply message.
1173 */
1174 read_again:
1175 mutex_enter(&call->call_lock);
1176 interrupted = 0;
1177 if (call->call_status == RPC_TIMEDOUT) {
1178 /*
1179 * Indicate that the lwp is not to be stopped while waiting
1180 * for this network traffic. This is to avoid deadlock while
1181 * debugging a process via /proc and also to avoid recursive
1182 * mutex_enter()s due to NFS page faults while stopping
1183 * (NFS holds locks when it calls here).
1184 */
1185 clock_t cv_wait_ret;
1186 clock_t timout;
1187 clock_t oldlbolt;
1188
1189 klwp_t *lwp = ttolwp(curthread);
1190
1191 if (lwp != NULL)
1192 lwp->lwp_nostop++;
1193
1194 oldlbolt = ddi_get_lbolt();
1195 timout = wait.tv_sec * drv_usectohz(1000000) +
1196 drv_usectohz(wait.tv_usec) + oldlbolt;
1197 /*
		 * Iterate until the call_status is changed to something
		 * other than RPC_TIMEDOUT, or until cv_timedwait_sig()
		 * returns something <= 0: a return of 0 means we were
		 * interrupted, and -1 means we timed out.
1202 */
1203 if (h->cl_nosignal)
1204 while ((cv_wait_ret = cv_timedwait(&call->call_cv,
1205 &call->call_lock, timout)) > 0 &&
1206 call->call_status == RPC_TIMEDOUT)
1207 ;
1208 else
1209 while ((cv_wait_ret = cv_timedwait_sig(
1210 &call->call_cv,
1211 &call->call_lock, timout)) > 0 &&
1212 call->call_status == RPC_TIMEDOUT)
1213 ;
1214
1215 switch (cv_wait_ret) {
1216 case 0:
1217 /*
1218 * If we got out of the above loop with
1219 * cv_timedwait_sig() returning 0, then we were
1220 * interrupted regardless what call_status is.
1221 */
1222 interrupted = 1;
1223 break;
1224 case -1:
1225 /* cv_timedwait_sig() timed out */
1226 break;
1227 default:
1228
1229 /*
1230 * We were cv_signaled(). If we didn't
1231 * get a successful call_status and returned
1232 * before time expired, delay up to clnt_cots_min_tout
1233 * seconds so that the caller doesn't immediately
1234 * try to call us again and thus force the
1235 * same condition that got us here (such
			 * as an RPC_XPRTFAILED due to the server not
			 * listening on the end-point).
1238 */
1239 if (call->call_status != RPC_SUCCESS) {
1240 clock_t curlbolt;
1241 clock_t diff;
1242
1243 curlbolt = ddi_get_lbolt();
1244 ticks = clnt_cots_min_tout *
1245 drv_usectohz(1000000);
1246 diff = curlbolt - oldlbolt;
1247 if (diff < ticks) {
1248 delay_first = TRUE;
1249 if (diff > 0)
1250 ticks -= diff;
1251 }
1252 }
1253 break;
1254 }
1255
1256 if (lwp != NULL)
1257 lwp->lwp_nostop--;
1258 }
1259 /*
1260 * Get the reply message, if any. This will be freed at the end
1261 * whether or not an error occurred.
1262 */
1263 mp = call->call_reply;
1264 call->call_reply = NULL;
1265
1266 /*
1267 * call_err is the error info when the call is on dispatch queue.
1268 * cku_err is the error info returned to the caller.
1269 * Sync cku_err with call_err for local message processing.
1270 */
1271
1272 status = call->call_status;
1273 p->cku_err = call->call_err;
1274 mutex_exit(&call->call_lock);
1275
1276 if (status != RPC_SUCCESS) {
1277 switch (status) {
1278 case RPC_TIMEDOUT:
1279 now = ddi_get_lbolt();
1280 if (interrupted) {
1281 COTSRCSTAT_INCR(p->cku_stats, rcintrs);
1282 p->cku_err.re_status = RPC_INTR;
1283 p->cku_err.re_errno = EINTR;
1284 RPCLOG(1, "clnt_cots_kcallit: xid 0x%x",
1285 p->cku_xid);
1286 RPCLOG(1, "signal interrupted at %ld", now);
1287 RPCLOG(1, ", was sent at %ld\n", time_sent);
1288 } else {
1289 COTSRCSTAT_INCR(p->cku_stats, rctimeouts);
1290 p->cku_err.re_errno = ETIMEDOUT;
1291 RPCLOG(1, "clnt_cots_kcallit: timed out at %ld",
1292 now);
1293 RPCLOG(1, ", was sent at %ld\n", time_sent);
1294 }
1295 break;
1296
1297 case RPC_XPRTFAILED:
1298 if (p->cku_err.re_errno == 0)
1299 p->cku_err.re_errno = EIO;
1300
1301 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
1302 p->cku_err.re_errno);
1303 break;
1304
1305 case RPC_SYSTEMERROR:
1306 ASSERT(p->cku_err.re_errno);
1307 RPCLOG(1, "clnt_cots_kcallit: system error: %d\n",
1308 p->cku_err.re_errno);
1309 break;
1310
1311 default:
1312 p->cku_err.re_status = RPC_SYSTEMERROR;
1313 p->cku_err.re_errno = EIO;
1314 RPCLOG(1, "clnt_cots_kcallit: error: %s\n",
1315 clnt_sperrno(status));
1316 break;
1317 }
1318 if (p->cku_err.re_status != RPC_TIMEDOUT) {
1319
1320 if (p->cku_flags & CKU_ONQUEUE) {
1321 call_table_remove(call);
1322 p->cku_flags &= ~CKU_ONQUEUE;
1323 }
1324
1325 RPCLOG(64, "clnt_cots_kcallit: non TIMEOUT so xid 0x%x "
1326 "taken off dispatch list\n", p->cku_xid);
1327 if (call->call_reply) {
1328 freemsg(call->call_reply);
1329 call->call_reply = NULL;
1330 }
1331 } else if (wait.tv_sec != 0) {
1332 /*
			 * We've sent the request over TCP and so we have
			 * every reason to believe it will get
			 * delivered, in which case returning a timeout
			 * is not appropriate.
1337 */
1338 if (p->cku_progress == TRUE &&
1339 p->cku_recv_attempts < clnt_cots_maxrecv) {
1340 p->cku_err.re_status = RPC_INPROGRESS;
1341 }
1342 }
1343 goto cots_done;
1344 }
1345
1346 xdrs = &p->cku_inxdr;
1347 xdrmblk_init(xdrs, mp, XDR_DECODE, 0);
1348
1349 reply_msg.rm_direction = REPLY;
1350 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
1351 reply_msg.acpted_rply.ar_stat = SUCCESS;
1352
1353 reply_msg.acpted_rply.ar_verf = _null_auth;
1354 /*
1355 * xdr_results will be done in AUTH_UNWRAP.
1356 */
1357 reply_msg.acpted_rply.ar_results.where = NULL;
1358 reply_msg.acpted_rply.ar_results.proc = xdr_void;
1359
1360 if (xdr_replymsg(xdrs, &reply_msg)) {
1361 enum clnt_stat re_status;
1362
1363 _seterr_reply(&reply_msg, &p->cku_err);
1364
1365 re_status = p->cku_err.re_status;
1366 if (re_status == RPC_SUCCESS) {
1367 /*
1368 * Reply is good, check auth.
1369 */
1370 if (!AUTH_VALIDATE(h->cl_auth,
1371 &reply_msg.acpted_rply.ar_verf)) {
1372 COTSRCSTAT_INCR(p->cku_stats, rcbadverfs);
1373 RPCLOG0(1, "clnt_cots_kcallit: validation "
1374 "failure\n");
1375 freemsg(mp);
1376 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
1377 XDR_DESTROY(xdrs);
1378 mutex_enter(&call->call_lock);
1379 if (call->call_reply == NULL)
1380 call->call_status = RPC_TIMEDOUT;
1381 mutex_exit(&call->call_lock);
1382 goto read_again;
1383 } else if (!AUTH_UNWRAP(h->cl_auth, xdrs,
1384 xdr_results, resultsp)) {
1385 RPCLOG0(1, "clnt_cots_kcallit: validation "
1386 "failure (unwrap)\n");
1387 p->cku_err.re_status = RPC_CANTDECODERES;
1388 p->cku_err.re_errno = EIO;
1389 }
1390 } else {
1391 /* set errno in case we can't recover */
1392 if (re_status != RPC_VERSMISMATCH &&
1393 re_status != RPC_AUTHERROR &&
1394 re_status != RPC_PROGVERSMISMATCH)
1395 p->cku_err.re_errno = EIO;
1396
1397 if (re_status == RPC_AUTHERROR) {
1398 /*
				 * Maybe our credentials need to be refreshed.
1400 */
1401 if (cm_entry) {
1402 /*
				 * There is the potential that the
				 * cm_entry has been or will be marked
				 * dead, so drop the connection altogether
				 * and force REFRESH to establish a new
				 * connection.
1408 */
1409 connmgr_cancelconn(cm_entry);
1410 cm_entry = NULL;
1411 }
1412
1413 (void) xdr_rpc_free_verifier(xdrs,
1414 &reply_msg);
1415 XDR_DESTROY(xdrs);
1416
1417 if (p->cku_flags & CKU_ONQUEUE) {
1418 call_table_remove(call);
1419 p->cku_flags &= ~CKU_ONQUEUE;
1420 }
1421 RPCLOG(64,
1422 "clnt_cots_kcallit: AUTH_ERROR, xid"
1423 " 0x%x removed off dispatch list\n",
1424 p->cku_xid);
1425 if (call->call_reply) {
1426 freemsg(call->call_reply);
1427 call->call_reply = NULL;
1428 }
1429
1430 if ((refreshes > 0) &&
1431 AUTH_REFRESH(h->cl_auth, &reply_msg,
1432 p->cku_cred)) {
1433 refreshes--;
1434 freemsg(mp);
1435 mp = NULL;
1436
1437 COTSRCSTAT_INCR(p->cku_stats,
1438 rcbadcalls);
1439 COTSRCSTAT_INCR(p->cku_stats,
1440 rcnewcreds);
1441 goto call_again;
1442 }
1443
1444 /*
1445 * We have used the client handle to
1446 * do an AUTH_REFRESH and the RPC status may
				 * be set to RPC_SUCCESS; let's make sure to
1448 * set it to RPC_AUTHERROR.
1449 */
1450 p->cku_err.re_status = RPC_AUTHERROR;
1451
1452 /*
1453 * Map recoverable and unrecoverable
1454 * authentication errors to appropriate errno
1455 */
1456 switch (p->cku_err.re_why) {
1457 case AUTH_TOOWEAK:
1458 /*
				 * This could be a failure where the
				 * server requires use of a reserved
				 * port; check and optionally set
				 * useresvport in the client handle and
				 * try one more time. Next go-round we
				 * fall out with the tooweak error.
1465 */
1466 if (p->cku_useresvport != 1) {
1467 p->cku_useresvport = 1;
1468 p->cku_xid = 0;
1469 freemsg(mp);
1470 mp = NULL;
1471 goto call_again;
1472 }
1473 /* FALLTHRU */
1474 case AUTH_BADCRED:
1475 case AUTH_BADVERF:
1476 case AUTH_INVALIDRESP:
1477 case AUTH_FAILED:
1478 case RPCSEC_GSS_NOCRED:
1479 case RPCSEC_GSS_FAILED:
1480 p->cku_err.re_errno = EACCES;
1481 break;
1482 case AUTH_REJECTEDCRED:
1483 case AUTH_REJECTEDVERF:
1484 default: p->cku_err.re_errno = EIO;
1485 break;
1486 }
1487 RPCLOG(1, "clnt_cots_kcallit : authentication"
1488 " failed with RPC_AUTHERROR of type %d\n",
1489 (int)p->cku_err.re_why);
1490 goto cots_done;
1491 }
1492 }
1493 } else {
1494 /* reply didn't decode properly. */
1495 p->cku_err.re_status = RPC_CANTDECODERES;
1496 p->cku_err.re_errno = EIO;
1497 RPCLOG0(1, "clnt_cots_kcallit: decode failure\n");
1498 }
1499
1500 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
1501 XDR_DESTROY(xdrs);
1502
1503 if (p->cku_flags & CKU_ONQUEUE) {
1504 call_table_remove(call);
1505 p->cku_flags &= ~CKU_ONQUEUE;
1506 }
1507
1508 RPCLOG(64, "clnt_cots_kcallit: xid 0x%x taken off dispatch list",
1509 p->cku_xid);
1510 RPCLOG(64, " status is %s\n", clnt_sperrno(p->cku_err.re_status));
1511 cots_done:
1512 if (cm_entry)
1513 connmgr_release(cm_entry);
1514
1515 if (mp != NULL)
1516 freemsg(mp);
1517 if ((p->cku_flags & CKU_ONQUEUE) == 0 && call->call_reply) {
1518 freemsg(call->call_reply);
1519 call->call_reply = NULL;
1520 }
1521 if (p->cku_err.re_status != RPC_SUCCESS) {
1522 RPCLOG0(1, "clnt_cots_kcallit: tail-end failure\n");
1523 COTSRCSTAT_INCR(p->cku_stats, rcbadcalls);
1524 }
1525
1526 /*
1527 * No point in delaying if the zone is going away.
1528 */
1529 if (delay_first == TRUE &&
1530 !(zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)) {
1531 if (clnt_delay(ticks, h->cl_nosignal) == EINTR) {
1532 p->cku_err.re_errno = EINTR;
1533 p->cku_err.re_status = RPC_INTR;
1534 }
1535 }
1536 return (p->cku_err.re_status);
1537 }
1538
1539 /*
1540 * Kinit routine for cots. This sets up the correct operations in
1541 * the client handle, as the handle may have previously been a clts
1542 * handle, and clears the xid field so there is no way a new call
1543 * could be mistaken for a retry. It also sets in the handle the
1544 * information that is passed at create/kinit time but needed at
 * call time, as cots creates the transport at call time: the device,
 * the address of the server, and the protocol family.
1547 */
1548 void
1549 clnt_cots_kinit(CLIENT *h, dev_t dev, int family, struct netbuf *addr,
1550 int max_msgsize, cred_t *cred)
1551 {
1552 /* LINTED pointer alignment */
1553 cku_private_t *p = htop(h);
1554 calllist_t *call = &p->cku_call;
1555
1556 h->cl_ops = &tcp_ops;
1557 if (p->cku_flags & CKU_ONQUEUE) {
1558 call_table_remove(call);
1559 p->cku_flags &= ~CKU_ONQUEUE;
1560 RPCLOG(64, "clnt_cots_kinit: removing call for xid 0x%x from"
1561 " dispatch list\n", p->cku_xid);
1562 }
1563
1564 if (call->call_reply != NULL) {
1565 freemsg(call->call_reply);
1566 call->call_reply = NULL;
1567 }
1568
1569 call->call_bucket = NULL;
1570 call->call_hash = 0;
1571
1572 /*
1573 * We don't clear cku_flags here, because clnt_cots_kcallit()
1574 * takes care of handling the cku_flags reset.
1575 */
1576 p->cku_xid = 0;
1577 p->cku_device = dev;
1578 p->cku_addrfmly = family;
1579 p->cku_cred = cred;
1580
1581 if (p->cku_addr.maxlen < addr->len) {
1582 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
1583 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
1584 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
1585 p->cku_addr.maxlen = addr->maxlen;
1586 }
1587
1588 p->cku_addr.len = addr->len;
1589 bcopy(addr->buf, p->cku_addr.buf, addr->len);
1590
1591 /*
1592 * If the current sanity check size in rpcmod is smaller
1593 * than the size needed, then increase the sanity check.
1594 */
1595 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
1596 max_msgsize > *clnt_max_msg_sizep) {
1597 mutex_enter(&clnt_max_msg_lock);
1598 if (max_msgsize > *clnt_max_msg_sizep)
1599 *clnt_max_msg_sizep = max_msgsize;
1600 mutex_exit(&clnt_max_msg_lock);
1601 }
1602 }
1603
1604 /*
1605 * ksettimers is a no-op for cots, with the exception of setting the xid.
1606 */
1607 /* ARGSUSED */
1608 static int
1609 clnt_cots_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
1610 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg, uint32_t xid)
1611 {
1612 /* LINTED pointer alignment */
1613 cku_private_t *p = htop(h);
1614
1615 if (xid)
1616 p->cku_xid = xid;
1617 COTSRCSTAT_INCR(p->cku_stats, rctimers);
1618 return (0);
1619 }
1620
1621 extern void rpc_poptimod(struct vnode *);
1622 extern int kstr_push(struct vnode *, char *);
1623
1624 int
1625 conn_kstat_update(kstat_t *ksp, int rw)
1626 {
1627 struct cm_xprt *cm_entry;
1628 struct cm_kstat_xprt *cm_ksp_data;
1629 uchar_t *b;
1630 char *fbuf;
1631
1632 if (rw == KSTAT_WRITE)
1633 return (EACCES);
1634 if (ksp == NULL || ksp->ks_private == NULL)
1635 return (EIO);
1636 cm_entry = (struct cm_xprt *)ksp->ks_private;
1637 cm_ksp_data = (struct cm_kstat_xprt *)ksp->ks_data;
1638
1639 cm_ksp_data->x_wq.value.ui32 = (uint32_t)(uintptr_t)cm_entry->x_wq;
1640 cm_ksp_data->x_family.value.ui32 = cm_entry->x_family;
1641 cm_ksp_data->x_rdev.value.ui32 = (uint32_t)cm_entry->x_rdev;
1642 cm_ksp_data->x_time.value.ui32 = cm_entry->x_time;
1643 cm_ksp_data->x_ref.value.ui32 = cm_entry->x_ref;
1644 cm_ksp_data->x_state.value.ui32 = cm_entry->x_state_flags;
1645
1646 if (cm_entry->x_server.buf) {
1647 fbuf = cm_ksp_data->x_server.value.str.addr.ptr;
1648 if (cm_entry->x_family == AF_INET &&
1649 cm_entry->x_server.len ==
1650 sizeof (struct sockaddr_in)) {
1651 struct sockaddr_in *sa;
1652 sa = (struct sockaddr_in *)
1653 cm_entry->x_server.buf;
1654 b = (uchar_t *)&sa->sin_addr;
1655 (void) sprintf(fbuf,
1656 "%d.%d.%d.%d", b[0] & 0xFF, b[1] & 0xFF,
1657 b[2] & 0xFF, b[3] & 0xFF);
1658 cm_ksp_data->x_port.value.ui32 = ntohs(sa->sin_port);
1659 } else if (cm_entry->x_family == AF_INET6 &&
1660 cm_entry->x_server.len >=
1661 sizeof (struct sockaddr_in6)) {
1662 /* extract server IP address & port */
1663 struct sockaddr_in6 *sin6;
1664 sin6 = (struct sockaddr_in6 *)cm_entry->x_server.buf;
1665 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr, fbuf,
1666 INET6_ADDRSTRLEN);
1667 cm_ksp_data->x_port.value.ui32 = ntohs(sin6->sin6_port);
1668 } else {
1669 struct sockaddr_in *sa;
1670
1671 sa = (struct sockaddr_in *)cm_entry->x_server.buf;
1672 b = (uchar_t *)&sa->sin_addr;
1673 (void) sprintf(fbuf,
1674 "%d.%d.%d.%d", b[0] & 0xFF, b[1] & 0xFF,
1675 b[2] & 0xFF, b[3] & 0xFF);
1676 }
1677 KSTAT_NAMED_STR_BUFLEN(&cm_ksp_data->x_server) =
1678 strlen(fbuf) + 1;
1679 }
1680
1681 return (0);
1682 }
1683
1684
1685 /*
 * We want a version of delay which is interruptible by a UNIX signal.
 * Return EINTR if an interrupt occurred.
1688 */
1689 static int
1690 clnt_delay(clock_t ticks, bool_t nosignal)
1691 {
1692 if (nosignal == TRUE) {
1693 delay(ticks);
1694 return (0);
1695 }
1696 return (delay_sig(ticks));
1697 }
1698
1699 /*
1700 * Wait for a connection until a timeout, or until we are
1701 * signalled that there has been a connection state change.
1702 */
1703 static enum clnt_stat
1704 connmgr_cwait(struct cm_xprt *cm_entry, const struct timeval *waitp,
1705 bool_t nosignal)
1706 {
1707 bool_t interrupted;
1708 clock_t timout, cv_stat;
1709 enum clnt_stat clstat;
1710 unsigned int old_state;
1711
1712 ASSERT(MUTEX_HELD(&connmgr_lock));
1713 /*
1714 * We wait for the transport connection to be made, or an
1715 * indication that it could not be made.
1716 */
1717 clstat = RPC_TIMEDOUT;
1718 interrupted = FALSE;
1719
1720 old_state = cm_entry->x_state_flags;
1721 /*
	 * Now loop until cv_timedwait{_sig}() returns because of
	 * a signal (0), a timeout (-1), or a cv_signal (> 0). But it may
	 * be cv_signalled for various other reasons too, so loop
	 * until there is a state change on the connection.
1726 */
1727
1728 timout = waitp->tv_sec * drv_usectohz(1000000) +
1729 drv_usectohz(waitp->tv_usec) + ddi_get_lbolt();
1730
1731 if (nosignal) {
1732 while ((cv_stat = cv_timedwait(&cm_entry->x_conn_cv,
1733 &connmgr_lock, timout)) > 0 &&
1734 cm_entry->x_state_flags == old_state)
1735 ;
1736 } else {
1737 while ((cv_stat = cv_timedwait_sig(&cm_entry->x_conn_cv,
1738 &connmgr_lock, timout)) > 0 &&
1739 cm_entry->x_state_flags == old_state)
1740 ;
1741
1742 if (cv_stat == 0) /* got intr signal? */
1743 interrupted = TRUE;
1744 }
1745
1746 if ((cm_entry->x_state_flags & (X_BADSTATES|X_CONNECTED)) ==
1747 X_CONNECTED) {
1748 clstat = RPC_SUCCESS;
1749 } else {
1750 if (interrupted == TRUE)
1751 clstat = RPC_INTR;
1752 RPCLOG(1, "connmgr_cwait: can't connect, error: %s\n",
1753 clnt_sperrno(clstat));
1754 }
1755
1756 return (clstat);
1757 }
1758
1759 /*
1760 * Primary interface for how RPC grabs a connection.
1761 */
1762 static struct cm_xprt *
1763 connmgr_wrapget(
1764 struct netbuf *retryaddr,
1765 const struct timeval *waitp,
1766 cku_private_t *p)
1767 {
1768 struct cm_xprt *cm_entry;
1769
1770 cm_entry = connmgr_get(retryaddr, waitp, &p->cku_addr, p->cku_addrfmly,
1771 &p->cku_srcaddr, &p->cku_err, p->cku_device,
1772 p->cku_client.cl_nosignal, p->cku_useresvport, p->cku_cred);
1773
1774 if (cm_entry == NULL) {
1775 /*
1776 * Re-map the call status to RPC_INTR if the err code is
		 * EINTR. This can happen if the call status is RPC_TLIERROR.
1778 * However, don't re-map if signalling has been turned off.
1779 * XXX Really need to create a separate thread whenever
1780 * there isn't an existing connection.
1781 */
1782 if (p->cku_err.re_errno == EINTR) {
1783 if (p->cku_client.cl_nosignal == TRUE)
1784 p->cku_err.re_errno = EIO;
1785 else
1786 p->cku_err.re_status = RPC_INTR;
1787 }
1788 }
1789
1790 return (cm_entry);
1791 }
1792
1793 /*
1794 * Obtains a transport to the server specified in addr. If a suitable transport
1795 * does not already exist in the list of cached transports, a new connection
1796 * is created, connected, and added to the list. The connection is for sending
1797 * only - the reply message may come back on another transport connection.
1798 *
1799 * To implement round-robin load balancing with multiple client connections,
 * the last entry on the list is always selected. Once the entry is
 * selected, it is re-inserted at the head of the list.
1802 */
1803 static struct cm_xprt *
1804 connmgr_get(
1805 struct netbuf *retryaddr,
	const struct timeval *waitp,	/* changed to a ptr to conserve stack */
1807 struct netbuf *destaddr,
1808 int addrfmly,
1809 struct netbuf *srcaddr,
1810 struct rpc_err *rpcerr,
1811 dev_t device,
1812 bool_t nosignal,
1813 int useresvport,
1814 cred_t *cr)
1815 {
1816 struct cm_xprt *cm_entry;
1817 struct cm_xprt *lru_entry;
1818 struct cm_xprt **cmp, **prev;
1819 queue_t *wq;
1820 TIUSER *tiptr;
1821 int i;
1822 int retval;
1823 int tidu_size;
1824 bool_t connected;
1825 zoneid_t zoneid = rpc_zoneid();
1826
1827 /*
1828 * If the call is not a retry, look for a transport entry that
1829 * goes to the server of interest.
1830 */
1831 mutex_enter(&connmgr_lock);
1832
1833 if (retryaddr == NULL) {
1834 use_new_conn:
1835 i = 0;
1836 cm_entry = lru_entry = NULL;
1837
1838 prev = cmp = &cm_hd;
1839 while ((cm_entry = *cmp) != NULL) {
1840 ASSERT(cm_entry != cm_entry->x_next);
1841 /*
			 * Garbage collect connections that are marked
			 * as needing a disconnect.
1844 */
1845 if (cm_entry->x_needdis) {
1846 CONN_HOLD(cm_entry);
1847 connmgr_dis_and_wait(cm_entry);
1848 connmgr_release(cm_entry);
1849 /*
1850 * connmgr_lock could have been
1851 * dropped for the disconnect
1852 * processing so start over.
1853 */
1854 goto use_new_conn;
1855 }
1856
1857 /*
1858 * Garbage collect the dead connections that have
1859 * no threads working on them.
1860 */
1861 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) ==
1862 X_DEAD) {
1863 mutex_enter(&cm_entry->x_lock);
1864 if (cm_entry->x_ref != 0) {
1865 /*
1866 * Currently in use.
1867 * Cleanup later.
1868 */
1869 cmp = &cm_entry->x_next;
1870 mutex_exit(&cm_entry->x_lock);
1871 continue;
1872 }
1873 mutex_exit(&cm_entry->x_lock);
1874 *cmp = cm_entry->x_next;
1875 mutex_exit(&connmgr_lock);
1876 connmgr_close(cm_entry);
1877 mutex_enter(&connmgr_lock);
1878 goto use_new_conn;
1879 }
1882 if ((cm_entry->x_state_flags & X_BADSTATES) == 0 &&
1883 cm_entry->x_zoneid == zoneid &&
1884 cm_entry->x_rdev == device &&
1885 destaddr->len == cm_entry->x_server.len &&
1886 bcmp(destaddr->buf, cm_entry->x_server.buf,
1887 destaddr->len) == 0) {
1888 /*
1889 * If the matching entry isn't connected,
1890 * attempt to reconnect it.
1891 */
1892 if (cm_entry->x_connected == FALSE) {
1893 /*
1894 * We don't go through trying
1895 * to find the least recently
				 * used connection because
1897 * connmgr_reconnect() briefly
1898 * dropped the connmgr_lock,
1899 * allowing a window for our
1900 * accounting to be messed up.
1901 * In any case, a re-connected
1902 * connection is as good as
				 * an LRU connection.
1904 */
1905 return (connmgr_wrapconnect(cm_entry,
1906 waitp, destaddr, addrfmly, srcaddr,
1907 rpcerr, TRUE, nosignal, cr));
1908 }
1909 i++;
1910
1911 /* keep track of the last entry */
1912 lru_entry = cm_entry;
1913 prev = cmp;
1914 }
1915 cmp = &cm_entry->x_next;
1916 }
1917
1918 if (i > clnt_max_conns) {
1919 RPCLOG(8, "connmgr_get: too many conns, dooming entry"
1920 " %p\n", (void *)lru_entry->x_tiptr);
1921 lru_entry->x_doomed = TRUE;
1922 goto use_new_conn;
1923 }
1924
1925 /*
1926 * If we are at the maximum number of connections to
1927 * the server, hand back the least recently used one.
1928 */
1929 if (i == clnt_max_conns) {
1930 /*
1931 * Copy into the handle the source address of
1932 * the connection, which we will use in case of
1933 * a later retry.
1934 */
1935 if (srcaddr->len != lru_entry->x_src.len) {
1936 if (srcaddr->len > 0)
1937 kmem_free(srcaddr->buf,
1938 srcaddr->maxlen);
1939 srcaddr->buf = kmem_zalloc(
1940 lru_entry->x_src.len, KM_SLEEP);
1941 srcaddr->maxlen = srcaddr->len =
1942 lru_entry->x_src.len;
1943 }
1944 bcopy(lru_entry->x_src.buf, srcaddr->buf, srcaddr->len);
1945 RPCLOG(2, "connmgr_get: call going out on %p\n",
1946 (void *)lru_entry);
1947 lru_entry->x_time = ddi_get_lbolt();
1948 CONN_HOLD(lru_entry);
1949
1950 if ((i > 1) && (prev != &cm_hd)) {
1951 /*
1952 * remove and re-insert entry at head of list.
1953 */
1954 *prev = lru_entry->x_next;
1955 lru_entry->x_next = cm_hd;
1956 cm_hd = lru_entry;
1957 }
1958
1959 mutex_exit(&connmgr_lock);
1960 return (lru_entry);
1961 }
1962
1963 } else {
1964 /*
1965 * This is the retry case (retryaddr != NULL). Retries must
1966 * be sent on the same source port as the original call.
1967 */
1968
1969 /*
1970 * Walk the list looking for a connection with a source address
1971 * that matches the retry address.
1972 */
1973 start_retry_loop:
1974 cmp = &cm_hd;
1975 while ((cm_entry = *cmp) != NULL) {
1976 ASSERT(cm_entry != cm_entry->x_next);
1977
1978 /*
1979 * determine if this connection matches the passed
1980 * in retry address. If it does not match, advance
1981 * to the next element on the list.
1982 */
1983 if (zoneid != cm_entry->x_zoneid ||
1984 device != cm_entry->x_rdev ||
1985 retryaddr->len != cm_entry->x_src.len ||
1986 bcmp(retryaddr->buf, cm_entry->x_src.buf,
1987 retryaddr->len) != 0) {
1988 cmp = &cm_entry->x_next;
1989 continue;
1990 }
1991 /*
			 * Garbage collect connections that are marked
			 * as needing a disconnect.
1994 */
1995 if (cm_entry->x_needdis) {
1996 CONN_HOLD(cm_entry);
1997 connmgr_dis_and_wait(cm_entry);
1998 connmgr_release(cm_entry);
1999 /*
2000 * connmgr_lock could have been
2001 * dropped for the disconnect
2002 * processing so start over.
2003 */
2004 goto start_retry_loop;
2005 }
2006 /*
2007 * Garbage collect the dead connections that have
2008 * no threads working on them.
2009 */
2010 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) ==
2011 X_DEAD) {
2012 mutex_enter(&cm_entry->x_lock);
2013 if (cm_entry->x_ref != 0) {
2014 /*
2015 * Currently in use.
2016 * Cleanup later.
2017 */
2018 cmp = &cm_entry->x_next;
2019 mutex_exit(&cm_entry->x_lock);
2020 continue;
2021 }
2022 mutex_exit(&cm_entry->x_lock);
2023 *cmp = cm_entry->x_next;
2024 mutex_exit(&connmgr_lock);
2025 connmgr_close(cm_entry);
2026 mutex_enter(&connmgr_lock);
2027 goto start_retry_loop;
2028 }
2029
2030 /*
2031 * Sanity check: if the connection with our source
2032 * port is going to some other server, something went
2033 * wrong, as we never delete connections (i.e. release
2034 * ports) unless they have been idle. In this case,
2035 * it is probably better to send the call out using
2036 * a new source address than to fail it altogether,
2037 * since that port may never be released.
2038 */
2039 if (destaddr->len != cm_entry->x_server.len ||
2040 bcmp(destaddr->buf, cm_entry->x_server.buf,
2041 destaddr->len) != 0) {
2042 RPCLOG(1, "connmgr_get: tiptr %p"
2043 " is going to a different server"
2044 " with the port that belongs"
2045 " to us!\n", (void *)cm_entry->x_tiptr);
2046 retryaddr = NULL;
2047 goto use_new_conn;
2048 }
2049
2050 /*
2051 * If the connection of interest is not connected and we
2052 * can't reconnect it, then the server is probably
2053 * still down. Return NULL to the caller and let it
2054 * retry later if it wants to. We have a delay so the
2055 * machine doesn't go into a tight retry loop. If the
			 * entry was already connected, or the reconnect was
2057 * successful, return this entry.
2058 */
2059 if (cm_entry->x_connected == FALSE) {
2060 return (connmgr_wrapconnect(cm_entry,
2061 waitp, destaddr, addrfmly, NULL,
2062 rpcerr, TRUE, nosignal, cr));
2063 } else {
2064 CONN_HOLD(cm_entry);
2065
2066 cm_entry->x_time = ddi_get_lbolt();
2067 mutex_exit(&connmgr_lock);
2068 RPCLOG(2, "connmgr_get: found old "
2069 "transport %p for retry\n",
2070 (void *)cm_entry);
2071 return (cm_entry);
2072 }
2073 }
2074
2075 /*
2076 * We cannot find an entry in the list for this retry.
2077 * Either the entry has been removed temporarily to be
2078 * reconnected by another thread, or the original call
2079 * got a port but never got connected,
2080 * and hence the transport never got put in the
2081 * list. Fall through to the "create new connection" code -
2082 * the former case will fail there trying to rebind the port,
		 * and the latter case (and any other pathological cases) will
2084 * rebind and reconnect and not hang the client machine.
2085 */
2086 RPCLOG0(8, "connmgr_get: no entry in list for retry\n");
2087 }
2088 /*
2089 * Set up a transport entry in the connection manager's list.
2090 */
2091 cm_entry = (struct cm_xprt *)
2092 kmem_zalloc(sizeof (struct cm_xprt), KM_SLEEP);
2093
2094 cm_entry->x_server.buf = kmem_zalloc(destaddr->len, KM_SLEEP);
2095 bcopy(destaddr->buf, cm_entry->x_server.buf, destaddr->len);
2096 cm_entry->x_server.len = cm_entry->x_server.maxlen = destaddr->len;
2097
2098 cm_entry->x_state_flags = X_THREAD;
2099 cm_entry->x_ref = 1;
2100 cm_entry->x_family = addrfmly;
2101 cm_entry->x_rdev = device;
2102 cm_entry->x_zoneid = zoneid;
2103 mutex_init(&cm_entry->x_lock, NULL, MUTEX_DEFAULT, NULL);
2104 cv_init(&cm_entry->x_cv, NULL, CV_DEFAULT, NULL);
2105 cv_init(&cm_entry->x_conn_cv, NULL, CV_DEFAULT, NULL);
2106 cv_init(&cm_entry->x_dis_cv, NULL, CV_DEFAULT, NULL);
2107
2108 /*
2109 * Note that we add this partially initialized entry to the
	 * connection list. This is so that we don't end up with duplicate
	 * connections to the same server.
2112 *
2113 * Note that x_src is not initialized at this point. This is because
2114 * retryaddr might be NULL in which case x_src is whatever
2115 * t_kbind/bindresvport gives us. If another thread wants a
2116 * connection to the same server, seemingly we have an issue, but we
2117 * don't. If the other thread comes in with retryaddr == NULL, then it
2118 * will never look at x_src, and it will end up waiting in
2119 * connmgr_cwait() for the first thread to finish the connection
2120 * attempt. If the other thread comes in with retryaddr != NULL, then
2121 * that means there was a request sent on a connection, in which case
	 * the connection should already exist. Thus the first thread
	 * never gets here ... it finds the connection to its server in the
2124 * connection list.
2125 *
	 * But even if this theory is wrong, in the retryaddr != NULL case,
	 * the 2nd thread will skip us because x_src.len == 0.
2128 */
2129 cm_entry->x_next = cm_hd;
2130 cm_hd = cm_entry;
2131 mutex_exit(&connmgr_lock);
2132
2133 /*
2134 * Either we didn't find an entry to the server of interest, or we
2135 * don't have the maximum number of connections to that server -
2136 * create a new connection.
2137 */
2138 RPCLOG0(8, "connmgr_get: creating new connection\n");
2139 rpcerr->re_status = RPC_TLIERROR;
2140
2141 i = t_kopen(NULL, device, FREAD|FWRITE|FNDELAY, &tiptr, zone_kcred());
2142 if (i) {
2143 RPCLOG(1, "connmgr_get: can't open cots device, error %d\n", i);
2144 rpcerr->re_errno = i;
2145 connmgr_cancelconn(cm_entry);
2146 return (NULL);
2147 }
2148 rpc_poptimod(tiptr->fp->f_vnode);
2149
2150 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0,
2151 K_TO_K, kcred, &retval)) {
2152 RPCLOG(1, "connmgr_get: can't push cots module, %d\n", i);
2153 (void) t_kclose(tiptr, 1);
2154 rpcerr->re_errno = i;
2155 connmgr_cancelconn(cm_entry);
2156 return (NULL);
2157 }
2158
2159 if (i = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K,
2160 kcred, &retval)) {
2161 RPCLOG(1, "connmgr_get: can't set client status with cots "
2162 "module, %d\n", i);
2163 (void) t_kclose(tiptr, 1);
2164 rpcerr->re_errno = i;
2165 connmgr_cancelconn(cm_entry);
2166 return (NULL);
2167 }
2168
2169 mutex_enter(&connmgr_lock);
2170
2171 wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next;
2172 cm_entry->x_wq = wq;
2173
2174 mutex_exit(&connmgr_lock);
2175
2176 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0,
2177 K_TO_K, kcred, &retval)) {
2178 RPCLOG(1, "connmgr_get: can't push timod, %d\n", i);
2179 (void) t_kclose(tiptr, 1);
2180 rpcerr->re_errno = i;
2181 connmgr_cancelconn(cm_entry);
2182 return (NULL);
2183 }
2184
2185 /*
2186 * If the caller has not specified reserved port usage then
2187 * take the system default.
2188 */
2189 if (useresvport == -1)
2190 useresvport = clnt_cots_do_bindresvport;
2191
2192 if ((useresvport || retryaddr != NULL) &&
2193 (addrfmly == AF_INET || addrfmly == AF_INET6)) {
2194 bool_t alloc_src = FALSE;
2195
2196 if (srcaddr->len != destaddr->len) {
2197 kmem_free(srcaddr->buf, srcaddr->maxlen);
2198 srcaddr->buf = kmem_zalloc(destaddr->len, KM_SLEEP);
2199 srcaddr->maxlen = destaddr->len;
2200 srcaddr->len = destaddr->len;
2201 alloc_src = TRUE;
2202 }
2203
2204 if ((i = bindresvport(tiptr, retryaddr, srcaddr, TRUE)) != 0) {
2205 (void) t_kclose(tiptr, 1);
2206 RPCLOG(1, "connmgr_get: couldn't bind, retryaddr: "
2207 "%p\n", (void *)retryaddr);
2208
2209 /*
2210 * 1225408: If we allocated a source address, then it
2211 * is either garbage or all zeroes. In that case
2212 * we need to clear srcaddr.
2213 */
2214 if (alloc_src == TRUE) {
2215 kmem_free(srcaddr->buf, srcaddr->maxlen);
2216 srcaddr->maxlen = srcaddr->len = 0;
2217 srcaddr->buf = NULL;
2218 }
2219 rpcerr->re_errno = i;
2220 connmgr_cancelconn(cm_entry);
2221 return (NULL);
2222 }
2223 } else {
2224 if ((i = t_kbind(tiptr, NULL, NULL)) != 0) {
2225 RPCLOG(1, "clnt_cots_kcreate: t_kbind: %d\n", i);
2226 (void) t_kclose(tiptr, 1);
2227 rpcerr->re_errno = i;
2228 connmgr_cancelconn(cm_entry);
2229 return (NULL);
2230 }
2231 }
2232
2233 {
2234 /*
		 * Keep the kernel stack lean. Don't move this calllist_t
		 * declaration to the top of this function, because another
		 * one is declared in connmgr_wrapconnect().
2238 */
2239 calllist_t call;
2240
2241 bzero(&call, sizeof (call));
2242 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL);
2243
2244 /*
		 * This is a bound end-point so don't close its stream.
2246 */
2247 connected = connmgr_connect(cm_entry, wq, destaddr, addrfmly,
2248 &call, &tidu_size, FALSE, waitp, nosignal, cr);
2249 *rpcerr = call.call_err;
2250 cv_destroy(&call.call_cv);
2251
2252 }
2253
2254 mutex_enter(&connmgr_lock);
2255
2256 /*
2257 * Set up a transport entry in the connection manager's list.
2258 */
2259 cm_entry->x_src.buf = kmem_zalloc(srcaddr->len, KM_SLEEP);
2260 bcopy(srcaddr->buf, cm_entry->x_src.buf, srcaddr->len);
2261 cm_entry->x_src.len = cm_entry->x_src.maxlen = srcaddr->len;
2262
2263 cm_entry->x_tiptr = tiptr;
2264 cm_entry->x_time = ddi_get_lbolt();
2265
2266 if (tiptr->tp_info.servtype == T_COTS_ORD)
2267 cm_entry->x_ordrel = TRUE;
2268 else
2269 cm_entry->x_ordrel = FALSE;
2270
2271 cm_entry->x_tidu_size = tidu_size;
2272
2273 if (cm_entry->x_early_disc) {
2274 /*
		 * Check whether a disconnect request came in while we
		 * were connecting; if so, we need to set
		 * rpcerr->re_status appropriately before returning
		 * NULL to the caller.
2279 */
2280 if (rpcerr->re_status == RPC_SUCCESS)
2281 rpcerr->re_status = RPC_XPRTFAILED;
2282 cm_entry->x_connected = FALSE;
2283 cm_entry->x_dead = TRUE;
2284 } else
2285 cm_entry->x_connected = connected;
2286
2287 /*
2288 * There could be a discrepancy here such that
2289 * x_early_disc is TRUE yet connected is TRUE as well
2290 * and the connection is actually connected. In that case
	 * let's be conservative and declare the connection as not
2292 * connected.
2293 */
2294 cm_entry->x_early_disc = FALSE;
2295 cm_entry->x_needdis = (cm_entry->x_connected == FALSE);
2296 cm_entry->x_ctime = ddi_get_lbolt();
2297
2298 /*
2299 * Notify any threads waiting that the connection attempt is done.
2300 */
2301 cm_entry->x_thread = FALSE;
2302 cv_broadcast(&cm_entry->x_conn_cv);
2303
2304 if (cm_entry->x_connected == FALSE) {
2305 mutex_exit(&connmgr_lock);
2306 connmgr_release(cm_entry);
2307 return (NULL);
2308 }
2309
2310 mutex_exit(&connmgr_lock);
2311
2312 return (cm_entry);
2313 }
2314
2315 /*
 * Keep the cm_xprt entry on the connection list while making a connection.
 * This prevents multiple connections to a slow server from appearing.
 * We use the bit field x_thread to tell if a thread is doing a connection
 * attempt, which keeps other interested threads from messing with the
 * connection. Those other threads just wait if x_thread is set.
2321 *
2322 * If x_thread is not set, then we do the actual work of connecting via
2323 * connmgr_connect().
2324 *
2325 * mutex convention: called with connmgr_lock held, returns with it released.
2326 */
2327 static struct cm_xprt *
2328 connmgr_wrapconnect(
2329 struct cm_xprt *cm_entry,
2330 const struct timeval *waitp,
2331 struct netbuf *destaddr,
2332 int addrfmly,
2333 struct netbuf *srcaddr,
2334 struct rpc_err *rpcerr,
2335 bool_t reconnect,
2336 bool_t nosignal,
2337 cred_t *cr)
2338 {
2339 ASSERT(MUTEX_HELD(&connmgr_lock));
2340 /*
2341 * Hold this entry as we are about to drop connmgr_lock.
2342 */
2343 CONN_HOLD(cm_entry);
2344
2345 /*
2346 * If there is a thread already making a connection for us, then
2347 * wait for it to complete the connection.
2348 */
2349 if (cm_entry->x_thread == TRUE) {
2350 rpcerr->re_status = connmgr_cwait(cm_entry, waitp, nosignal);
2351
2352 if (rpcerr->re_status != RPC_SUCCESS) {
2353 mutex_exit(&connmgr_lock);
2354 connmgr_release(cm_entry);
2355 return (NULL);
2356 }
2357 } else {
2358 bool_t connected;
2359 calllist_t call;
2360
2361 cm_entry->x_thread = TRUE;
2362
2363 while (cm_entry->x_needrel == TRUE) {
2364 cm_entry->x_needrel = FALSE;
2365
2366 connmgr_sndrel(cm_entry);
2367 delay(drv_usectohz(1000000));
2368
2369 mutex_enter(&connmgr_lock);
2370 }
2371
2372 /*
2373 * If we need to send a T_DISCON_REQ, send one.
2374 */
2375 connmgr_dis_and_wait(cm_entry);
2376
2377 mutex_exit(&connmgr_lock);
2378
2379 bzero(&call, sizeof (call));
2380 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL);
2381
2382 connected = connmgr_connect(cm_entry, cm_entry->x_wq,
2383 destaddr, addrfmly, &call, &cm_entry->x_tidu_size,
2384 reconnect, waitp, nosignal, cr);
2385
2386 *rpcerr = call.call_err;
2387 cv_destroy(&call.call_cv);
2388
2389 mutex_enter(&connmgr_lock);
2392 if (cm_entry->x_early_disc) {
2393 /*
			 * Check whether a disconnect request came in while
			 * we were connecting; if so, we need to set
			 * rpcerr->re_status appropriately before returning
			 * NULL to the caller.
2398 */
2399 if (rpcerr->re_status == RPC_SUCCESS)
2400 rpcerr->re_status = RPC_XPRTFAILED;
2401 cm_entry->x_connected = FALSE;
2402 cm_entry->x_dead = TRUE;
2403 } else
2404 cm_entry->x_connected = connected;
2405
2406 /*
2407 * There could be a discrepancy here such that
2408 * x_early_disc is TRUE yet connected is TRUE as well
2409 * and the connection is actually connected. In that case
		 * let's be conservative and declare the connection as not
2411 * connected.
2412 */
2414 cm_entry->x_early_disc = FALSE;
2415 cm_entry->x_needdis = (cm_entry->x_connected == FALSE);
2418 /*
2419 * connmgr_connect() may have given up before the connection
2420 * actually timed out. So ensure that before the next
2421 * connection attempt we do a disconnect.
2422 */
2423 cm_entry->x_ctime = ddi_get_lbolt();
2424 cm_entry->x_thread = FALSE;
2425
2426 cv_broadcast(&cm_entry->x_conn_cv);
2427
2428 if (cm_entry->x_connected == FALSE) {
2429 mutex_exit(&connmgr_lock);
2430 connmgr_release(cm_entry);
2431 return (NULL);
2432 }
2433 }
2434
2435 if (srcaddr != NULL) {
2436 /*
2437 * Copy into the handle the
2438 * source address of the
2439 * connection, which we will use
2440 * in case of a later retry.
2441 */
2442 if (srcaddr->len != cm_entry->x_src.len) {
2443 if (srcaddr->maxlen > 0)
2444 kmem_free(srcaddr->buf, srcaddr->maxlen);
2445 srcaddr->buf = kmem_zalloc(cm_entry->x_src.len,
2446 KM_SLEEP);
2447 srcaddr->maxlen = srcaddr->len =
2448 cm_entry->x_src.len;
2449 }
2450 bcopy(cm_entry->x_src.buf, srcaddr->buf, srcaddr->len);
2451 }
2452 cm_entry->x_time = ddi_get_lbolt();
2453 mutex_exit(&connmgr_lock);
2454 return (cm_entry);
2455 }
2456
2457 /*
2458 * If we need to send a T_DISCON_REQ, send one.
2459 */
2460 static void
2461 connmgr_dis_and_wait(struct cm_xprt *cm_entry)
2462 {
2463 ASSERT(MUTEX_HELD(&connmgr_lock));
2464 for (;;) {
2465 while (cm_entry->x_needdis == TRUE) {
2466 RPCLOG(8, "connmgr_dis_and_wait: need "
2467 "T_DISCON_REQ for connection 0x%p\n",
2468 (void *)cm_entry);
2469 cm_entry->x_needdis = FALSE;
2470 cm_entry->x_waitdis = TRUE;
2471
2472 connmgr_snddis(cm_entry);
2473
2474 mutex_enter(&connmgr_lock);
2475 }
2476
2477 if (cm_entry->x_waitdis == TRUE) {
2478 clock_t timout;
2479
2480 RPCLOG(8, "connmgr_dis_and_wait waiting for "
2481 "T_DISCON_REQ's ACK for connection %p\n",
2482 (void *)cm_entry);
2483
2484 timout = clnt_cots_min_conntout * drv_usectohz(1000000);
2485
2486 /*
2487 * The TPI spec says that the T_DISCON_REQ
2488 * will get acknowledged, but in practice
2489 * the ACK may never get sent. So don't
2490 * block forever.
2491 */
2492 (void) cv_reltimedwait(&cm_entry->x_dis_cv,
2493 &connmgr_lock, timout, TR_CLOCK_TICK);
2494 }
2495 /*
2496 * If we got the ACK, break. If we didn't,
2497 * then send another T_DISCON_REQ.
2498 */
2499 if (cm_entry->x_waitdis == FALSE) {
2500 break;
2501 } else {
2502 RPCLOG(8, "connmgr_dis_and_wait: did"
2503 "not get T_DISCON_REQ's ACK for "
2504 "connection %p\n", (void *)cm_entry);
2505 cm_entry->x_needdis = TRUE;
2506 }
2507 }
2508 }
2509
2510 static void
2511 connmgr_cancelconn(struct cm_xprt *cm_entry)
2512 {
2513 /*
2514 * Mark the connection table entry as dead; the next thread that
2515 * goes through connmgr_release() will notice this and deal with it.
2516 */
2517 mutex_enter(&connmgr_lock);
2518 cm_entry->x_dead = TRUE;
2519
2520 /*
2521 * Notify any threads waiting for the connection that it isn't
2522 * going to happen.
2523 */
2524 cm_entry->x_thread = FALSE;
2525 cv_broadcast(&cm_entry->x_conn_cv);
2526 mutex_exit(&connmgr_lock);
2527
2528 connmgr_release(cm_entry);
2529 }
2530
2531 static void
2532 connmgr_close(struct cm_xprt *cm_entry)
2533 {
2534 mutex_enter(&cm_entry->x_lock);
2535 while (cm_entry->x_ref != 0) {
2536 /*
2537 * Must be a noninterruptible wait.
2538 */
2539 cv_wait(&cm_entry->x_cv, &cm_entry->x_lock);
2540 }
2541
2542 if (cm_entry->x_tiptr != NULL)
2543 (void) t_kclose(cm_entry->x_tiptr, 1);
2544
2545 mutex_exit(&cm_entry->x_lock);
2546 if (cm_entry->x_ksp != NULL) {
2547 mutex_enter(&connmgr_lock);
2548 cm_entry->x_ksp->ks_private = NULL;
2549 mutex_exit(&connmgr_lock);
2550
2551 /*
2552 * Must free the buffer we allocated for the
2553 * server address in the update function
2554 */
2555 if (((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))->
2556 x_server.value.str.addr.ptr != NULL)
2557 kmem_free(((struct cm_kstat_xprt *)(cm_entry->x_ksp->
2558 ks_data))->x_server.value.str.addr.ptr,
2559 INET6_ADDRSTRLEN);
2560 kmem_free(cm_entry->x_ksp->ks_data,
2561 cm_entry->x_ksp->ks_data_size);
2562 kstat_delete(cm_entry->x_ksp);
2563 }
2564
2565 mutex_destroy(&cm_entry->x_lock);
2566 cv_destroy(&cm_entry->x_cv);
2567 cv_destroy(&cm_entry->x_conn_cv);
2568 cv_destroy(&cm_entry->x_dis_cv);
2569
2570 if (cm_entry->x_server.buf != NULL)
2571 kmem_free(cm_entry->x_server.buf, cm_entry->x_server.maxlen);
2572 if (cm_entry->x_src.buf != NULL)
2573 kmem_free(cm_entry->x_src.buf, cm_entry->x_src.maxlen);
2574 kmem_free(cm_entry, sizeof (struct cm_xprt));
2575 }
2576
2577 /*
2578 * Called by KRPC after sending the call message to release the connection
2579 * it was using.
2580 */
2581 static void
2582 connmgr_release(struct cm_xprt *cm_entry)
2583 {
2584 mutex_enter(&cm_entry->x_lock);
2585 cm_entry->x_ref--;
2586 if (cm_entry->x_ref == 0)
2587 cv_signal(&cm_entry->x_cv);
2588 mutex_exit(&cm_entry->x_lock);
2589 }
2590
2591 /*
2592 * Set TCP receive and xmit buffer size for RPC connections.
2593 */
2594 static bool_t
2595 connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr)
2596 {
2597 int ok = FALSE;
2598 int val;
2599
2600 if (rpc_default_tcp_bufsz)
2601 return (FALSE);
2602
2603 /*
	 * Only set a new buffer size if it's larger than the system
	 * default buffer size. If a smaller buffer size is needed,
	 * then use /etc/system to set rpc_default_tcp_bufsz to 1.
2607 */
2608 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr);
2609 if ((ok == TRUE) && (val < rpc_send_bufsz)) {
2610 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF,
2611 rpc_send_bufsz, e, cr);
2612 DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz,
2613 int, ok, calllist_t *, e);
2614 }
2615
2616 ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr);
2617 if ((ok == TRUE) && (val < rpc_recv_bufsz)) {
2618 ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF,
2619 rpc_recv_bufsz, e, cr);
2620 DTRACE_PROBE2(krpc__i__connmgr_sndbufsz,
2621 int, ok, calllist_t *, e);
2622 }
2623 return (TRUE);
2624 }
2625
2626 /*
2627 * Given an open stream, connect to the remote. Returns true if connected,
2628 * false otherwise.
2629 */
2630 static bool_t
2631 connmgr_connect(
2632 struct cm_xprt *cm_entry,
2633 queue_t *wq,
2634 struct netbuf *addr,
2635 int addrfmly,
2636 calllist_t *e,
2637 int *tidu_ptr,
2638 bool_t reconnect,
2639 const struct timeval *waitp,
2640 bool_t nosignal,
2641 cred_t *cr)
2642 {
2643 mblk_t *mp;
2644 struct T_conn_req *tcr;
2645 struct T_info_ack *tinfo;
2646 int interrupted, error;
2647 int tidu_size, kstat_instance;
2648
2649 /* if it's a reconnect, flush any lingering data messages */
2650 if (reconnect)
2651 (void) putctl1(wq, M_FLUSH, FLUSHRW);
2652
2653 /*
2654 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
2655 * appear as -1.
2656 */
2657 mp = allocb_cred(sizeof (*tcr) + addr->len, cr, NOPID);
2658 if (mp == NULL) {
2659 /*
2660 * This is unfortunate, but we need to look up the stats for
2661 * this zone to increment the "memory allocation failed"
2662 * counter. curproc->p_zone is safe since we're initiating a
2663 * connection and not in some strange streams context.
2664 */
2665 struct rpcstat *rpcstat;
2666
2667 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
2668 ASSERT(rpcstat != NULL);
2669
2670 RPCLOG0(1, "connmgr_connect: cannot alloc mp for "
2671 "sending conn request\n");
2672 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcnomem);
2673 e->call_status = RPC_SYSTEMERROR;
2674 e->call_reason = ENOSR;
2675 return (FALSE);
2676 }
2677
2678 /* Set TCP buffer size for RPC connections if needed */
2679 if (addrfmly == AF_INET || addrfmly == AF_INET6)
2680 (void) connmgr_setbufsz(e, wq, cr);
2681
2682 mp->b_datap->db_type = M_PROTO;
2683 tcr = (struct T_conn_req *)mp->b_rptr;
2684 bzero(tcr, sizeof (*tcr));
2685 tcr->PRIM_type = T_CONN_REQ;
2686 tcr->DEST_length = addr->len;
2687 tcr->DEST_offset = sizeof (struct T_conn_req);
2688 mp->b_wptr = mp->b_rptr + sizeof (*tcr);
2689
2690 bcopy(addr->buf, mp->b_wptr, tcr->DEST_length);
2691 mp->b_wptr += tcr->DEST_length;
2692
2693 RPCLOG(8, "connmgr_connect: sending conn request on queue "
2694 "%p", (void *)wq);
2695 RPCLOG(8, " call %p\n", (void *)wq);
2696 /*
2697 * We use the entry in the handle that is normally used for
2698 * waiting for RPC replies to wait for the connection accept.
2699 */
2700 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
2701 DTRACE_PROBE(krpc__e__connmgr__connect__cantsend);
2702 freemsg(mp);
2703 return (FALSE);
2704 }
2705
2706 mutex_enter(&clnt_pending_lock);
2707
2708 /*
2709 * We wait for the transport connection to be made, or an
2710 * indication that it could not be made.
2711 */
2712 interrupted = 0;
2713
2714 /*
2715 * waitforack should have been called with T_OK_ACK, but the
2716 * present implementation needs to be passed T_INFO_ACK to
2717 * work correctly.
2718 */
2719 error = waitforack(e, T_INFO_ACK, waitp, nosignal);
2720 if (error == EINTR)
2721 interrupted = 1;
2722 if (zone_status_get(curproc->p_zone) >= ZONE_IS_EMPTY) {
2723 /*
2724 * No time to lose; we essentially have been signaled to
2725 * quit.
2726 */
2727 interrupted = 1;
2728 }
2729 #ifdef RPCDEBUG
2730 if (error == ETIME)
2731 RPCLOG0(8, "connmgr_connect: giving up "
2732 "on connection attempt; "
2733 "clnt_dispatch notifyconn "
2734 "diagnostic 'no one waiting for "
2735 "connection' should not be "
2736 "unexpected\n");
2737 #endif
2738 if (e->call_prev)
2739 e->call_prev->call_next = e->call_next;
2740 else
2741 clnt_pending = e->call_next;
2742 if (e->call_next)
2743 e->call_next->call_prev = e->call_prev;
2744 mutex_exit(&clnt_pending_lock);
2745
2746 if (e->call_status != RPC_SUCCESS || error != 0) {
2747 if (interrupted)
2748 e->call_status = RPC_INTR;
2749 else if (error == ETIME)
2750 e->call_status = RPC_TIMEDOUT;
2751 else if (error == EPROTO) {
2752 e->call_status = RPC_SYSTEMERROR;
2753 e->call_reason = EPROTO;
2754 }
2755
2756 RPCLOG(8, "connmgr_connect: can't connect, status: "
2757 "%s\n", clnt_sperrno(e->call_status));
2758
2759 if (e->call_reply) {
2760 freemsg(e->call_reply);
2761 e->call_reply = NULL;
2762 }
2763
2764 return (FALSE);
2765 }
2766 /*
2767 * The result of the "connection accept" is a T_info_ack
2768 * in the call_reply field.
2769 */
2770 ASSERT(e->call_reply != NULL);
2771 mp = e->call_reply;
2772 e->call_reply = NULL;
2773 tinfo = (struct T_info_ack *)mp->b_rptr;
2774
2775 tidu_size = tinfo->TIDU_size;
2776 tidu_size -= (tidu_size % BYTES_PER_XDR_UNIT);
2777 if (tidu_size > COTS_DEFAULT_ALLOCSIZE || (tidu_size <= 0))
2778 tidu_size = COTS_DEFAULT_ALLOCSIZE;
2779 *tidu_ptr = tidu_size;
2780
2781 freemsg(mp);
2782
2783 /*
2784 * Set up the pertinent options. NODELAY is so the transport doesn't
2785 * buffer up RPC messages on either end. This may not be valid for
2786 * all transports. Failure to set this option is not cause to
2787 * bail out so we return success anyway. Note that lack of NODELAY
2788 * or some other way to flush the message on both ends will cause
2789 * lots of retries and terrible performance.
2790 */
2791 if (addrfmly == AF_INET || addrfmly == AF_INET6) {
2792 (void) connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr);
2793 if (e->call_status == RPC_XPRTFAILED)
2794 return (FALSE);
2795 }
2796
2797 /*
2798 * Since we have a connection, we now need to figure out if
2799 * we need to create a kstat. If x_ksp is not NULL then we
2800 * are reusing a connection and so we do not need to create
2801 * another kstat -- lets just return.
2802 */
2803 if (cm_entry->x_ksp != NULL)
2804 return (TRUE);
2805
2806 /*
2807 * We need to increment rpc_kstat_instance atomically to prevent
2808 * two kstats being created with the same instance.
2809 */
2810 kstat_instance = atomic_inc_32_nv((uint32_t *)&rpc_kstat_instance);
2811
2812 if ((cm_entry->x_ksp = kstat_create_zone("unix", kstat_instance,
2813 "rpc_cots_connections", "rpc", KSTAT_TYPE_NAMED,
2814 (uint_t)(sizeof (cm_kstat_xprt_t) / sizeof (kstat_named_t)),
2815 KSTAT_FLAG_VIRTUAL, cm_entry->x_zoneid)) == NULL) {
2816 return (TRUE);
2817 }
2818
2819 cm_entry->x_ksp->ks_lock = &connmgr_lock;
2820 cm_entry->x_ksp->ks_private = cm_entry;
2821 cm_entry->x_ksp->ks_data_size = ((INET6_ADDRSTRLEN * sizeof (char))
2822 + sizeof (cm_kstat_template));
2823 cm_entry->x_ksp->ks_data = kmem_alloc(cm_entry->x_ksp->ks_data_size,
2824 KM_SLEEP);
2825 bcopy(&cm_kstat_template, cm_entry->x_ksp->ks_data,
2826 cm_entry->x_ksp->ks_data_size);
2827 ((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))->
2828 x_server.value.str.addr.ptr =
2829 kmem_alloc(INET6_ADDRSTRLEN, KM_SLEEP);
2830
2831 cm_entry->x_ksp->ks_update = conn_kstat_update;
2832 kstat_install(cm_entry->x_ksp);
2833 return (TRUE);
2834 }
2835
2836 /*
2837 * Verify that the specified offset falls within the mblk and
2838 * that the resulting pointer is aligned.
2839 * Returns NULL if not.
2840 *
2841 * code from fs/sockfs/socksubr.c
2842 */
2843 static void *
2844 connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
2845 t_uscalar_t length, uint_t align_size)
2846 {
2847 uintptr_t ptr1, ptr2;
2848
2849 ASSERT(mp && mp->b_wptr >= mp->b_rptr);
2850 ptr1 = (uintptr_t)mp->b_rptr + offset;
2851 ptr2 = (uintptr_t)ptr1 + length;
2852 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
2853 return (NULL);
2854 }
2855 if ((ptr1 & (align_size - 1)) != 0) {
2856 return (NULL);
2857 }
2858 return ((void *)ptr1);
2859 }
2860
2861 static bool_t
2862 connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
2863 calllist_t *e, cred_t *cr)
2864 {
2865 mblk_t *mp;
2866 struct opthdr *opt, *opt_res;
2867 struct T_optmgmt_req *tor;
2868 struct T_optmgmt_ack *opt_ack;
2869 struct timeval waitp;
2870 int error;
2871
2872 mp = allocb_cred(sizeof (struct T_optmgmt_req) +
2873 sizeof (struct opthdr) + sizeof (int), cr, NOPID);
2874 if (mp == NULL)
2875 return (FALSE);
2876
2877 mp->b_datap->db_type = M_PROTO;
2878 tor = (struct T_optmgmt_req *)(mp->b_rptr);
2879 tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2880 tor->MGMT_flags = T_CURRENT;
2881 tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
2882 tor->OPT_offset = sizeof (struct T_optmgmt_req);
2883
2884 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
2885 opt->level = level;
2886 opt->name = name;
2887 opt->len = sizeof (int);
2888 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
2889 sizeof (int);
2890
2891 /*
2892 * We will use this connection regardless
2893 * of whether or not the option is readable.
2894 */
2895 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
2896 DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend);
2897 freemsg(mp);
2898 return (FALSE);
2899 }
2900
2901 mutex_enter(&clnt_pending_lock);
2902
2903 waitp.tv_sec = clnt_cots_min_conntout;
2904 waitp.tv_usec = 0;
2905 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);
2906
2907 if (e->call_prev)
2908 e->call_prev->call_next = e->call_next;
2909 else
2910 clnt_pending = e->call_next;
2911 if (e->call_next)
2912 e->call_next->call_prev = e->call_prev;
2913 mutex_exit(&clnt_pending_lock);
2914
2915 /* get reply message */
2916 mp = e->call_reply;
2917 e->call_reply = NULL;
2918
2919 if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) {
2921 DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name,
2922 int, e->call_status, int, error, mblk_t *, mp);
2923
2924 if (mp)
2925 freemsg(mp);
2926 return (FALSE);
2927 }
2928
2929 opt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
2930 opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset,
2931 opt_ack->OPT_length, __TPI_ALIGN_SIZE);
2932
2933 if (!opt_res) {
2934 DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name,
2935 int, opt_ack->OPT_offset, int, opt_ack->OPT_length);
2936 freemsg(mp);
2937 return (FALSE);
2938 }
2939 *val = *(int *)&opt_res[1];
2940
2941 DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val);
2942
2943 freemsg(mp);
2944 return (TRUE);
2945 }
2946
2947 /*
2948 * Called by connmgr_connect to set an option on the new stream.
2949 */
2950 static bool_t
2951 connmgr_setopt_int(queue_t *wq, int level, int name, int val,
2952 calllist_t *e, cred_t *cr)
2953 {
2954 mblk_t *mp;
2955 struct opthdr *opt;
2956 struct T_optmgmt_req *tor;
2957 struct timeval waitp;
2958 int error;
2959
2960 mp = allocb_cred(sizeof (struct T_optmgmt_req) +
2961 sizeof (struct opthdr) + sizeof (int), cr, NOPID);
2962 if (mp == NULL) {
2963 RPCLOG0(1, "connmgr_setopt: cannot alloc mp for option "
2964 "request\n");
2965 return (FALSE);
2966 }
2967
2968 mp->b_datap->db_type = M_PROTO;
2969 tor = (struct T_optmgmt_req *)(mp->b_rptr);
2970 tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2971 tor->MGMT_flags = T_NEGOTIATE;
2972 tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
2973 tor->OPT_offset = sizeof (struct T_optmgmt_req);
2974
2975 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
2976 opt->level = level;
2977 opt->name = name;
2978 opt->len = sizeof (int);
2979 *(int *)((char *)opt + sizeof (*opt)) = val;
2980 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
2981 sizeof (int);
2982
2983 /*
2984 * We will use this connection regardless
2985 * of whether or not the option is settable.
2986 */
2987 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
2988 DTRACE_PROBE(krpc__e__connmgr__setopt__cantsend);
2989 freemsg(mp);
2990 return (FALSE);
2991 }
2992
2993 mutex_enter(&clnt_pending_lock);
2994
2995 waitp.tv_sec = clnt_cots_min_conntout;
2996 waitp.tv_usec = 0;
2997 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);
2998
2999 if (e->call_prev)
3000 e->call_prev->call_next = e->call_next;
3001 else
3002 clnt_pending = e->call_next;
3003 if (e->call_next)
3004 e->call_next->call_prev = e->call_prev;
3005 mutex_exit(&clnt_pending_lock);
3006
3007 if (e->call_reply != NULL) {
3008 freemsg(e->call_reply);
3009 e->call_reply = NULL;
3010 }
3011
3012 if (e->call_status != RPC_SUCCESS || error != 0) {
3013 RPCLOG(1, "connmgr_setopt: can't set option: %d\n", name);
3014 return (FALSE);
3015 }
3016 RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name);
3017 return (TRUE);
3018 }
3019
3020 static bool_t
3021 connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
3022 {
3023 return (connmgr_setopt_int(wq, level, name, 1, e, cr));
3024 }
3025
3026 #ifdef DEBUG
3027
3028 /*
3029 * This is a knob to let us force code coverage in allocation failure
3030 * case.
3031 */
3032 static int connmgr_failsnd;
3033 #define CONN_SND_ALLOC(Size, Pri) \
3034 ((connmgr_failsnd-- > 0) ? NULL : allocb(Size, Pri))
3035
3036 #else
3037
3038 #define CONN_SND_ALLOC(Size, Pri) allocb(Size, Pri)
3039
3040 #endif
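
/*
 * Hypothetical mdb(1) session for forcing the allocation-failure path
 * on a DEBUG kernel (a debugging sketch, not a supported interface):
 *
 *	# echo 'connmgr_failsnd/W 2' | mdb -kw
 *
 * The next two CONN_SND_ALLOC() calls then return NULL, exercising the
 * x_needrel/x_needdis retry logic in the senders below.
 */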
3041
3042 /*
3043 * Sends an orderly release on the specified queue.
 * Entered with connmgr_lock held; exits with it released.
3045 */
3046 static void
3047 connmgr_sndrel(struct cm_xprt *cm_entry)
3048 {
3049 struct T_ordrel_req *torr;
3050 mblk_t *mp;
3051 queue_t *q = cm_entry->x_wq;

	ASSERT(MUTEX_HELD(&connmgr_lock));
3053 mp = CONN_SND_ALLOC(sizeof (struct T_ordrel_req), BPRI_LO);
3054 if (mp == NULL) {
3055 cm_entry->x_needrel = TRUE;
3056 mutex_exit(&connmgr_lock);
3057 RPCLOG(1, "connmgr_sndrel: cannot alloc mp for sending ordrel "
3058 "to queue %p\n", (void *)q);
3059 return;
3060 }
3061 mutex_exit(&connmgr_lock);
3062
3063 mp->b_datap->db_type = M_PROTO;
3064 torr = (struct T_ordrel_req *)(mp->b_rptr);
3065 torr->PRIM_type = T_ORDREL_REQ;
3066 mp->b_wptr = mp->b_rptr + sizeof (struct T_ordrel_req);
3067
3068 RPCLOG(8, "connmgr_sndrel: sending ordrel to queue %p\n", (void *)q);
3069 put(q, mp);
3070 }
3071
3072 /*
 * Sends a disconnect on the specified queue.
 * Entered with connmgr_lock held; exits with it released.
3075 */
3076 static void
3077 connmgr_snddis(struct cm_xprt *cm_entry)
3078 {
3079 struct T_discon_req *tdis;
3080 mblk_t *mp;
3081 queue_t *q = cm_entry->x_wq;
3082
3083 ASSERT(MUTEX_HELD(&connmgr_lock));
3084 mp = CONN_SND_ALLOC(sizeof (*tdis), BPRI_LO);
3085 if (mp == NULL) {
3086 cm_entry->x_needdis = TRUE;
3087 mutex_exit(&connmgr_lock);
3088 RPCLOG(1, "connmgr_snddis: cannot alloc mp for sending discon "
3089 "to queue %p\n", (void *)q);
3090 return;
3091 }
3092 mutex_exit(&connmgr_lock);
3093
3094 mp->b_datap->db_type = M_PROTO;
3095 tdis = (struct T_discon_req *)mp->b_rptr;
3096 tdis->PRIM_type = T_DISCON_REQ;
3097 mp->b_wptr = mp->b_rptr + sizeof (*tdis);
3098
3099 RPCLOG(8, "connmgr_snddis: sending discon to queue %p\n", (void *)q);
3100 put(q, mp);
3101 }
3102
3103 /*
3104 * Sets up the entry for receiving replies, and calls rpcmod's write put proc
3105 * (through put) to send the call.
3106 */
3107 static int
3108 clnt_dispatch_send(queue_t *q, mblk_t *mp, calllist_t *e, uint_t xid,
3109 uint_t queue_flag)
3110 {
3111 ASSERT(e != NULL);
3112
3113 e->call_status = RPC_TIMEDOUT; /* optimistic, eh? */
3114 e->call_reason = 0;
3115 e->call_wq = q;
3116 e->call_xid = xid;
3117 e->call_notified = FALSE;
3118
3119 if (!canput(q)) {
3120 e->call_status = RPC_CANTSEND;
3121 e->call_reason = ENOBUFS;
3122 return (RPC_CANTSEND);
3123 }
3124
3125 /*
3126 * If queue_flag is set then the calllist_t is already on the hash
3127 * queue. In this case just send the message and return.
3128 */
3129 if (queue_flag) {
3130 put(q, mp);
3131 return (RPC_SUCCESS);
3132
3133 }
3134
3135 /*
3136 * Set up calls for RPC requests (with XID != 0) on the hash
3137 * queue for fast lookups and place other calls (i.e.
3138 * connection management) on the linked list.
3139 */
3140 if (xid != 0) {
3141 RPCLOG(64, "clnt_dispatch_send: putting xid 0x%x on "
3142 "dispatch list\n", xid);
3143 e->call_hash = call_hash(xid, clnt_cots_hash_size);
3144 e->call_bucket = &cots_call_ht[e->call_hash];
3145 call_table_enter(e);
3146 } else {
3147 mutex_enter(&clnt_pending_lock);
3148 if (clnt_pending)
3149 clnt_pending->call_prev = e;
3150 e->call_next = clnt_pending;
3151 e->call_prev = NULL;
3152 clnt_pending = e;
3153 mutex_exit(&clnt_pending_lock);
3154 }
3155
3156 put(q, mp);
3157 return (RPC_SUCCESS);
3158 }
3159
3160 /*
3161 * Called by rpcmod to notify a client with a clnt_pending call that its reply
3162 * has arrived. If we can't find a client waiting for this reply, we log
3163 * the error and return.
3164 */
3165 bool_t
3166 clnt_dispatch_notify(mblk_t *mp, zoneid_t zoneid)
3167 {
3168 calllist_t *e = NULL;
3169 call_table_t *chtp;
3170 uint32_t xid;
3171 uint_t hash;
3172
3173 if ((IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) &&
3174 (mp->b_wptr - mp->b_rptr) >= sizeof (xid))
3175 xid = *((uint32_t *)mp->b_rptr);
3176 else {
3177 int i = 0;
3178 unsigned char *p = (unsigned char *)&xid;
3179 unsigned char *rptr;
3180 mblk_t *tmp = mp;
3181
3182 /*
		 * Copy the xid out of the mblk chain, byte by byte.
3184 */
3185 while (tmp) {
3186 rptr = tmp->b_rptr;
3187 while (rptr < tmp->b_wptr) {
3188 *p++ = *rptr++;
3189 if (++i >= sizeof (xid))
3190 goto done_xid_copy;
3191 }
3192 tmp = tmp->b_cont;
3193 }
3194
3195 /*
3196 * If we got here, we ran out of mblk space before the
3197 * xid could be copied.
3198 */
3199 ASSERT(tmp == NULL && i < sizeof (xid));
3200
3201 RPCLOG0(1,
3202 "clnt_dispatch_notify: message less than size of xid\n");
3203 return (FALSE);
3204
3205 }
3206 done_xid_copy:
3207
3208 hash = call_hash(xid, clnt_cots_hash_size);
3209 chtp = &cots_call_ht[hash];
3210 /* call_table_find returns with the hash bucket locked */
3211 call_table_find(chtp, xid, e);
3212
3213 if (e != NULL) {
3214 /*
3215 * Found thread waiting for this reply
3216 */
3217 mutex_enter(&e->call_lock);
3218
3219 /*
3220 * verify that the reply is coming in on
3221 * the same zone that it was sent from.
3222 */
3223 if (e->call_zoneid != zoneid) {
3224 mutex_exit(&e->call_lock);
3225 mutex_exit(&chtp->ct_lock);
3226 RPCLOG0(1, "clnt_dispatch_notify: incorrect zoneid\n");
3227 return (FALSE);
3228 }
3229
3230 if (e->call_reply)
3231 /*
3232 * This can happen under the following scenario:
3233 * clnt_cots_kcallit() times out on the response,
3234 * rfscall() repeats the CLNT_CALL() with
3235 * the same xid, clnt_cots_kcallit() sends the retry,
3236 * thereby putting the clnt handle on the pending list,
3237 * the first response arrives, signalling the thread
3238 * in clnt_cots_kcallit(). Before that thread is
3239 * dispatched, the second response arrives as well,
3240 * and clnt_dispatch_notify still finds the handle on
3241 * the pending list, with call_reply set. So free the
3242 * old reply now.
3243 *
3244 * It is also possible for a response intended for
3245 * an RPC call with a different xid to reside here.
3246 * This can happen if the thread that owned this
3247 * client handle prior to the current owner bailed
3248 * out and left its call record on the dispatch
3249 * queue. A window exists where the response can
3250 * arrive before the current owner dispatches its
3251 * RPC call.
3252 *
3253 * In any case, this is the very last point where we
3254 * can safely check the call_reply field before
3255 * placing the new response there.
3256 */
3257 freemsg(e->call_reply);
3258 e->call_reply = mp;
3259 e->call_status = RPC_SUCCESS;
3260 e->call_notified = TRUE;
3261 cv_signal(&e->call_cv);
3262 mutex_exit(&e->call_lock);
3263 mutex_exit(&chtp->ct_lock);
3264 return (TRUE);
3265 } else {
3266 zone_t *zone;
3267 struct rpcstat *rpcstat;
3268
3269 mutex_exit(&chtp->ct_lock);
3270 RPCLOG(65, "clnt_dispatch_notify: no caller for reply 0x%x\n",
3271 xid);
3272 /*
3273 * This is unfortunate, but we need to lookup the zone so we
3274 * can increment its "rcbadxids" counter.
3275 */
3276 zone = zone_find_by_id(zoneid);
3277 if (zone == NULL) {
3278 /*
3279 * The zone went away...
3280 */
3281 return (FALSE);
3282 }
3283 rpcstat = zone_getspecific(rpcstat_zone_key, zone);
3284 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
3285 /*
3286 * Not interested
3287 */
3288 zone_rele(zone);
3289 return (FALSE);
3290 }
3291 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcbadxids);
3292 zone_rele(zone);
3293 }
3294 return (FALSE);
3295 }
3296
3297 /*
3298 * Called by rpcmod when a non-data indication arrives. The ones in which we
3299 * are interested are connection indications and options acks. We dispatch
3300 * based on the queue the indication came in on. If we are not interested in
3301 * what came in, we return false to rpcmod, who will then pass it upstream.
3302 */
3303 bool_t
3304 clnt_dispatch_notifyconn(queue_t *q, mblk_t *mp)
3305 {
3306 calllist_t *e;
3307 int type;
3308
3309 ASSERT((q->q_flag & QREADR) == 0);
3310
3311 type = ((union T_primitives *)mp->b_rptr)->type;
3312 RPCLOG(8, "clnt_dispatch_notifyconn: prim type: [%s]\n",
3313 rpc_tpiprim2name(type));
3314 mutex_enter(&clnt_pending_lock);
3315 for (e = clnt_pending; /* NO CONDITION */; e = e->call_next) {
3316 if (e == NULL) {
3317 mutex_exit(&clnt_pending_lock);
3318 RPCLOG(1, "clnt_dispatch_notifyconn: no one waiting "
3319 "for connection on queue 0x%p\n", (void *)q);
3320 return (FALSE);
3321 }
3322 if (e->call_wq == q)
3323 break;
3324 }
3325
3326 switch (type) {
3327 case T_CONN_CON:
3328 /*
3329 * The transport is now connected, send a T_INFO_REQ to get
3330 * the tidu size.
3331 */
3332 mutex_exit(&clnt_pending_lock);
3333 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >=
3334 sizeof (struct T_info_req));
3335 mp->b_rptr = mp->b_datap->db_base;
3336 ((union T_primitives *)mp->b_rptr)->type = T_INFO_REQ;
3337 mp->b_wptr = mp->b_rptr + sizeof (struct T_info_req);
3338 mp->b_datap->db_type = M_PCPROTO;
3339 put(q, mp);
3340 return (TRUE);
3341 case T_INFO_ACK:
3342 case T_OPTMGMT_ACK:
3343 e->call_status = RPC_SUCCESS;
3344 e->call_reply = mp;
3345 e->call_notified = TRUE;
3346 cv_signal(&e->call_cv);
3347 break;
3348 case T_ERROR_ACK:
3349 e->call_status = RPC_CANTCONNECT;
3350 e->call_reply = mp;
3351 e->call_notified = TRUE;
3352 cv_signal(&e->call_cv);
3353 break;
3354 case T_OK_ACK:
3355 /*
3356 * Great, but we are really waiting for a T_CONN_CON
3357 */
3358 freemsg(mp);
3359 break;
3360 default:
3361 mutex_exit(&clnt_pending_lock);
3362 RPCLOG(1, "clnt_dispatch_notifyconn: bad type %d\n", type);
3363 return (FALSE);
3364 }
3365
3366 mutex_exit(&clnt_pending_lock);
3367 return (TRUE);
3368 }
3369
3370 /*
3371 * Called by rpcmod when the transport is (or should be) going away. Informs
3372 * all callers waiting for replies and marks the entry in the connection
3373 * manager's list as unconnected, and either closing (close handshake in
3374 * progress) or dead.
3375 */
3376 void
3377 clnt_dispatch_notifyall(queue_t *q, int32_t msg_type, int32_t reason)
3378 {
3379 calllist_t *e;
3380 call_table_t *ctp;
3381 struct cm_xprt *cm_entry;
3382 int have_connmgr_lock;
3383 int i;
3384
3385 ASSERT((q->q_flag & QREADR) == 0);
3386
3387 RPCLOG(1, "clnt_dispatch_notifyall on queue %p", (void *)q);
3388 RPCLOG(1, " received a notifcation prim type [%s]",
3389 rpc_tpiprim2name(msg_type));
3390 RPCLOG(1, " and reason %d\n", reason);
3391
3392 /*
3393 * Find the transport entry in the connection manager's list, close
3394 * the transport and delete the entry. In the case where rpcmod's
3395 * idle timer goes off, it sends us a T_ORDREL_REQ, indicating we
3396 * should gracefully close the connection.
3397 */
3398 have_connmgr_lock = 1;
3399 mutex_enter(&connmgr_lock);
3400 for (cm_entry = cm_hd; cm_entry; cm_entry = cm_entry->x_next) {
3401 ASSERT(cm_entry != cm_entry->x_next);
3402 if (cm_entry->x_wq == q) {
3403 ASSERT(MUTEX_HELD(&connmgr_lock));
3404 ASSERT(have_connmgr_lock == 1);
3405 switch (msg_type) {
3406 case T_ORDREL_REQ:
3407
3408 if (cm_entry->x_dead) {
3409 RPCLOG(1, "idle timeout on dead "
3410 "connection: %p\n",
3411 (void *)cm_entry);
3412 if (clnt_stop_idle != NULL)
3413 (*clnt_stop_idle)(q);
3414 break;
3415 }
3416
3417 /*
3418 * Only mark the connection as dead if it is
3419 * connected and idle.
3420 * An unconnected connection has probably
3421 * gone idle because the server is down,
3422 * and when it comes back up there will be
3423 * retries that need to use that connection.
3424 */
3425 if (cm_entry->x_connected ||
3426 cm_entry->x_doomed) {
3427 if (cm_entry->x_ordrel) {
3428 if (cm_entry->x_closing ==
3429 TRUE) {
3430 /*
3431 * The connection is
3432 * obviously wedged due
3433 * to a bug or problem
3434 * with the transport.
3435 * Mark it as dead.
3436 * Otherwise we can
3437 * leak connections.
3438 */
3439 cm_entry->x_dead = TRUE;
3440 mutex_exit(
3441 &connmgr_lock);
3442 have_connmgr_lock = 0;
3443 if (clnt_stop_idle !=
3444 NULL)
3445 (*clnt_stop_idle)(q);
3446 break;
3447 }
3448 cm_entry->x_closing = TRUE;
3449 connmgr_sndrel(cm_entry);
3450 have_connmgr_lock = 0;
3451 } else {
3452 cm_entry->x_dead = TRUE;
3453 mutex_exit(&connmgr_lock);
3454 have_connmgr_lock = 0;
3455 if (clnt_stop_idle != NULL)
3456 (*clnt_stop_idle)(q);
3457 }
3458 } else {
3459 /*
3460 * We don't mark the connection
3461 * as dead, but we turn off the
3462 * idle timer.
3463 */
3464 mutex_exit(&connmgr_lock);
3465 have_connmgr_lock = 0;
3466 if (clnt_stop_idle != NULL)
3467 (*clnt_stop_idle)(q);
3468 RPCLOG(1, "clnt_dispatch_notifyall:"
3469 " ignoring timeout from rpcmod"
3470 " (q %p) because we are not "
3471 " connected\n", (void *)q);
3472 }
3473 break;
3474 case T_ORDREL_IND:
3475 /*
3476 * If this entry is marked closing, then we are
3477 * completing a close handshake, and the
3478 * connection is dead. Otherwise, the server is
3479 * trying to close. Since the server will not
3480 * be sending any more RPC replies, we abort
3481 * the connection, including flushing
3482 * any RPC requests that are in-transit.
3483 * In either case, mark the entry as dead so
3484 * that it can be closed by the connection
3485 * manager's garbage collector.
3486 */
3487 cm_entry->x_dead = TRUE;
3488 if (cm_entry->x_closing) {
3489 mutex_exit(&connmgr_lock);
3490 have_connmgr_lock = 0;
3491 if (clnt_stop_idle != NULL)
3492 (*clnt_stop_idle)(q);
3493 } else {
3494 /*
3495 * if we're getting a disconnect
3496 * before we've finished our
3497 * connect attempt, mark it for
3498 * later processing
3499 */
3500 if (cm_entry->x_thread)
3501 cm_entry->x_early_disc = TRUE;
3502 else
3503 cm_entry->x_connected = FALSE;
3504 cm_entry->x_waitdis = TRUE;
3505 connmgr_snddis(cm_entry);
3506 have_connmgr_lock = 0;
3507 }
3508 break;
3509
3510 case T_ERROR_ACK:
3511 case T_OK_ACK:
3512 cm_entry->x_waitdis = FALSE;
3513 cv_signal(&cm_entry->x_dis_cv);
3514 mutex_exit(&connmgr_lock);
3515 return;
3516
3517 case T_DISCON_REQ:
3518 if (cm_entry->x_thread)
3519 cm_entry->x_early_disc = TRUE;
3520 else
3521 cm_entry->x_connected = FALSE;
3522 cm_entry->x_waitdis = TRUE;
3523
3524 connmgr_snddis(cm_entry);
3525 have_connmgr_lock = 0;
3526 break;
3527
3528 case T_DISCON_IND:
3529 default:
3530 /*
3531 * if we're getting a disconnect before
3532 * we've finished our connect attempt,
3533 * mark it for later processing
3534 */
3535 if (cm_entry->x_closing) {
3536 cm_entry->x_dead = TRUE;
3537 mutex_exit(&connmgr_lock);
3538 have_connmgr_lock = 0;
3539 if (clnt_stop_idle != NULL)
3540 (*clnt_stop_idle)(q);
3541 } else {
3542 if (cm_entry->x_thread) {
3543 cm_entry->x_early_disc = TRUE;
3544 } else {
3545 cm_entry->x_dead = TRUE;
3546 cm_entry->x_connected = FALSE;
3547 }
3548 }
3549 break;
3550 }
3551 break;
3552 }
3553 }
3554
3555 if (have_connmgr_lock)
3556 mutex_exit(&connmgr_lock);
3557
3558 if (msg_type == T_ERROR_ACK || msg_type == T_OK_ACK) {
3559 RPCLOG(1, "clnt_dispatch_notifyall: (wq %p) could not find "
3560 "connmgr entry for discon ack\n", (void *)q);
3561 return;
3562 }
3563
3564 /*
3565 * Then kick all the clnt_pending calls out of their wait. There
3566 * should be no clnt_pending calls in the case of rpcmod's idle
3567 * timer firing.
3568 */
3569 for (i = 0; i < clnt_cots_hash_size; i++) {
3570 ctp = &cots_call_ht[i];
3571 mutex_enter(&ctp->ct_lock);
3572 for (e = ctp->ct_call_next;
3573 e != (calllist_t *)ctp;
3574 e = e->call_next) {
3575 if (e->call_wq == q && e->call_notified == FALSE) {
3576 RPCLOG(1,
3577 "clnt_dispatch_notifyall for queue %p ",
3578 (void *)q);
3579 RPCLOG(1, "aborting clnt_pending call %p\n",
3580 (void *)e);
3581
3582 if (msg_type == T_DISCON_IND)
3583 e->call_reason = reason;
3584 e->call_notified = TRUE;
3585 e->call_status = RPC_XPRTFAILED;
3586 cv_signal(&e->call_cv);
3587 }
3588 }
3589 mutex_exit(&ctp->ct_lock);
3590 }
3591
3592 mutex_enter(&clnt_pending_lock);
3593 for (e = clnt_pending; e; e = e->call_next) {
3594 /*
3595 * Only signal those RPC handles that haven't been
3596 * signalled yet. Otherwise we can get a bogus call_reason.
3597 * This can happen if thread A is making a call over a
3598 * connection. If the server is killed, it will cause
3599 * reset, and reason will default to EIO as a result of
3600 * a T_ORDREL_IND. Thread B then attempts to recreate
3601 * the connection but gets a T_DISCON_IND. If we set the
3602 * call_reason code for all threads, then if thread A
3603 * hasn't been dispatched yet, it will get the wrong
3604 * reason. The bogus call_reason can make it harder to
3605 * discriminate between calls that fail because the
3606 * connection attempt failed versus those where the call
3607 * may have been executed on the server.
3608 */
3609 if (e->call_wq == q && e->call_notified == FALSE) {
3610 RPCLOG(1, "clnt_dispatch_notifyall for queue %p ",
3611 (void *)q);
3612 RPCLOG(1, " aborting clnt_pending call %p\n",
3613 (void *)e);
3614
3615 if (msg_type == T_DISCON_IND)
3616 e->call_reason = reason;
3617 e->call_notified = TRUE;
3618 /*
3619 * Let the caller timeout, else it will retry
3620 * immediately.
3621 */
3622 e->call_status = RPC_XPRTFAILED;
3623
3624 /*
			 * We used to just signal those threads
			 * waiting for a connection (call_xid = 0).
			 * That meant that threads waiting for a response
			 * waited till their timeout expired. This
			 * could be a long time if they've specified a
			 * maximum timeout (2^31 - 1), so we
			 * signal all threads now.
3632 */
3633 cv_signal(&e->call_cv);
3634 }
3635 }
3636 mutex_exit(&clnt_pending_lock);
3637 }
3640 /*ARGSUSED*/
3641 /*
 * After resuming a system that's been suspended for longer than the
3643 * NFS server's idle timeout (svc_idle_timeout for Solaris 2), rfscall()
3644 * generates "NFS server X not responding" and "NFS server X ok" messages;
3645 * here we reset inet connections to cause a re-connect and avoid those
 * NFS messages. See 4045054.
3647 */
3648 boolean_t
3649 connmgr_cpr_reset(void *arg, int code)
3650 {
3651 struct cm_xprt *cxp;
3652
3653 if (code == CB_CODE_CPR_CHKPT)
3654 return (B_TRUE);
3655
3656 if (mutex_tryenter(&connmgr_lock) == 0)
3657 return (B_FALSE);
3658 for (cxp = cm_hd; cxp; cxp = cxp->x_next) {
3659 if ((cxp->x_family == AF_INET || cxp->x_family == AF_INET6) &&
3660 cxp->x_connected == TRUE) {
3661 if (cxp->x_thread)
3662 cxp->x_early_disc = TRUE;
3663 else
3664 cxp->x_connected = FALSE;
3665 cxp->x_needdis = TRUE;
3666 }
3667 }
3668 mutex_exit(&connmgr_lock);
3669 return (B_TRUE);
3670 }
3671
3672 void
3673 clnt_cots_stats_init(zoneid_t zoneid, struct rpc_cots_client **statsp)
3674 {
3676 *statsp = (struct rpc_cots_client *)rpcstat_zone_init_common(zoneid,
3677 "unix", "rpc_cots_client", (const kstat_named_t *)&cots_rcstat_tmpl,
3678 sizeof (cots_rcstat_tmpl));
3679 }
3680
3681 void
3682 clnt_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_client **statsp)
3683 {
3684 rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_client");
3685 kmem_free(*statsp, sizeof (cots_rcstat_tmpl));
3686 }
3687
3688 void
3689 clnt_cots_init(void)
3690 {
3691 mutex_init(&connmgr_lock, NULL, MUTEX_DEFAULT, NULL);
3692 mutex_init(&clnt_pending_lock, NULL, MUTEX_DEFAULT, NULL);
3693
3694 if (clnt_cots_hash_size < DEFAULT_MIN_HASH_SIZE)
3695 clnt_cots_hash_size = DEFAULT_MIN_HASH_SIZE;
3696
3697 cots_call_ht = call_table_init(clnt_cots_hash_size);
3698 zone_key_create(&zone_cots_key, NULL, NULL, clnt_zone_destroy);
3699 }
3700
3701 void
3702 clnt_cots_fini(void)
3703 {
3704 (void) zone_key_delete(zone_cots_key);
3705 }
3706
3707 /*
3708 * Wait for TPI ack, returns success only if expected ack is received
3709 * within timeout period.
3710 */
3712 static int
3713 waitforack(calllist_t *e, t_scalar_t ack_prim, const struct timeval *waitp,
3714 bool_t nosignal)
3715 {
3716 union T_primitives *tpr;
3717 clock_t timout;
3718 int cv_stat = 1;
3719
3720 ASSERT(MUTEX_HELD(&clnt_pending_lock));
3721 while (e->call_reply == NULL) {
3722 if (waitp != NULL) {
3723 timout = waitp->tv_sec * drv_usectohz(MICROSEC) +
3724 drv_usectohz(waitp->tv_usec);
3725 if (nosignal)
3726 cv_stat = cv_reltimedwait(&e->call_cv,
3727 &clnt_pending_lock, timout, TR_CLOCK_TICK);
3728 else
3729 cv_stat = cv_reltimedwait_sig(&e->call_cv,
3730 &clnt_pending_lock, timout, TR_CLOCK_TICK);
3731 } else {
3732 if (nosignal)
3733 cv_wait(&e->call_cv, &clnt_pending_lock);
3734 else
3735 cv_stat = cv_wait_sig(&e->call_cv,
3736 &clnt_pending_lock);
3737 }
3738 if (cv_stat == -1)
3739 return (ETIME);
3740 if (cv_stat == 0)
3741 return (EINTR);
3742 /*
3743 * if we received an error from the server and we know a reply
3744 * is not going to be sent, do not wait for the full timeout,
3745 * return now.
3746 */
3747 if (e->call_status == RPC_XPRTFAILED)
3748 return (e->call_reason);
3749 }
3750 tpr = (union T_primitives *)e->call_reply->b_rptr;
3751 if (tpr->type == ack_prim)
3752 return (0); /* Success */
3753
3754 if (tpr->type == T_ERROR_ACK) {
3755 if (tpr->error_ack.TLI_error == TSYSERR)
3756 return (tpr->error_ack.UNIX_error);
3757 else
3758 return (t_tlitosyserr(tpr->error_ack.TLI_error));
3759 }
3760
3761 return (EPROTO); /* unknown or unexpected primitive */
3762 }