Print this page
5218 posix definition of NULL
correct unistd.h and iso/stddef_iso.h
update gate source affected
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c
+++ new/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 #include <unistd.h>
27 27 #include <sys/types.h>
28 28 #include <sys/stat.h>
29 29 #include <sys/statvfs.h>
30 30 #include <sys/uadmin.h>
31 31 #include <sys/resource.h>
32 32 #include <fcntl.h>
33 33 #include <stdio.h>
34 34 #include <thread.h>
35 35 #include <meta.h>
36 36 #include <sdssc.h>
37 37 #include <mdmn_changelog.h>
38 38 #include "mdmn_subr.h"
39 39
40 40 /*
41 41 * This is the communication daemon for SVM Multi Node Disksets.
42 42 * It runs on every node and provides the following rpc services:
43 43 * - mdmn_send_svc_2
44 44 * - mdmn_work_svc_2
45 45 * - mdmn_wakeup_initiator_svc_2
46 46 * - mdmn_wakeup_master_svc_2
47 47 * - mdmn_comm_lock_svc_2
48 48 * - mdmn_comm_unlock_svc_2
49 49 * - mdmn_comm_suspend_svc_2
50 50 * - mdmn_comm_resume_svc_2
51 51 * - mdmn_comm_reinit_set_svc_2
52 52 * where send, lock, unlock and reinit are meant for external use,
53 53 * work and the two wakeups are for internal use only.
54 54 *
55 55 * NOTE:
56 56 * On every node only one of those xxx_2 functions can be active at the
57 57 * same time because the daemon is single threaded.
58 58 *
59 59 * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s
60 60 * as part of their handlers, so those aspects are multi-threaded)
61 61 *
62 62 * In case an event occurs that has to be propagated to all the nodes...
63 63 *
64 64 * One node (the initiator)
65 65 * calls the libmeta function mdmn_send_message()
66 66 * This function calls the local daemon thru mdmn_send_svc_2.
67 67 *
68 68 * On the initiator:
69 69 * mdmn_send_svc_2()
70 70 * - starts a thread -> mdmn_send_to_work() and returns.
71 71 * mdmn_send_to_work()
72 72 * - sends this message over to the master of the diskset.
73 73 * This is done by calling mdmn_work_svc_2 on the master.
74 74 * - registers to the initiator_table
75 75 * - exits without doing a svc_sendreply() for the call to
76 76 * mdmn_send_svc_2. This means that call is blocked until somebody
77 77 * (see end of this comment) does a svc_sendreply().
78 78 * This means mdmn_send_message() does not yet return.
79 79 * - A timeout surveillance is started at this point.
80 80 * This means in case the master doesn't reply at all in an
81 81 * appropriate time, an error condition is returned
82 82 * to the caller.
83 83 *
84 84 * On the master:
85 85 * mdmn_work_svc_2()
86 86 * - starts a thread -> mdmn_master_process_msg() and returns
87 87 * mdmn_master_process_msg()
88 88 * - logs the message to the change log
89 89 * - executes the message locally
90 90 * - flags the message in the change log
91 91 * - sends the message to mdmn_work_svc_2() on all the
92 92 * other nodes (slaves)
93 93 * after each call to mdmn_work_svc_2 the thread goes to sleep and
94 94 * will be woken up by mdmn_wakeup_master_svc_2() as soon as the
95 95 * slave node is done with this message.
96 96 * - In case the slave doesn't respond in an appropriate time, an error
97 97 * is assumed to ensure the master doesn't wait forever.
98 98 *
99 99 * On a slave:
100 100 * mdmn_work_svc_2()
101 101 * - starts a thread -> mdmn_slave_process_msg() and returns
102 102 * mdmn_slave_process_msg()
103 103 * - processes this message locally by calling the appropriate message
104 104 * handler, that creates some result.
105 105 * - sends that result thru a call to mdmn_wakeup_master_svc_2() to
106 106 * the master.
107 107 *
108 108 * Back on the master:
109 109 * mdmn_wakeup_master_svc_2()
110 110 * - stores the result into the master_table.
111 111 * - signals the mdmn_master_process_msg-thread.
112 112 * - returns
113 113 * mdmn_master_process_msg()
114 114 * - after getting the results from all nodes
115 115 * - sends them back to the initiating node thru a call to
116 116 * mdmn_wakeup_initiator_svc_2.
117 117 *
118 118 * Back on the initiator:
119 119 * mdmn_wakeup_initiator_svc_2()
120 120 * - calls svc_sendreply() which makes the call to mdmn_send_svc_2()
121 121 * return.
122 122 * which allows the initial mdmn_send_message() call to return.
123 123 */
124 124
125 125 FILE *commdout; /* debug output for the commd */
126 126 char *commdoutfile; /* file name for the above output */
127 127 /* want at least 10 MB free space when logging into a file */
128 128 #define MIN_FS_SPACE (10LL * 1024 * 1024)
129 129
130 130 /*
131 131 * Number of outstanding messages that were initiated by this node.
132 132 * If zero, check_timeouts goes to sleep
133 133 */
134 134 uint_t messages_on_their_way;
135 135 mutex_t check_timeout_mutex; /* need mutex to protect above */
136 136 cond_t check_timeout_cv; /* trigger for check_timeouts */
137 137
138 138 /* for printing out time stamps */
139 139 hrtime_t __savetime;
140 140
141 141 /* RPC clients for every set and every node and their protecting locks */
142 142 CLIENT *client[MD_MAXSETS][NNODES];
143 143 rwlock_t client_rwlock[MD_MAXSETS];
144 144
145 145 /* the descriptors of all possible sets and their protectors */
146 146 struct md_set_desc *set_descriptor[MD_MAXSETS];
147 147 rwlock_t set_desc_rwlock[MD_MAXSETS];
148 148
149 149 /* the daemon to daemon communication has to timeout quickly */
150 150 static struct timeval FOUR_SECS = { 4, 0 };
151 151
152 152 /* These indicate if a set has already been setup */
153 153 int md_mn_set_inited[MD_MAXSETS];
154 154
155 155 /* For every set we have a message completion table and protecting mutexes */
156 156 md_mn_mct_t *mct[MD_MAXSETS];
157 157 mutex_t mct_mutex[MD_MAXSETS][MD_MN_NCLASSES];
158 158
159 159 /* Stuff to describe the global status of the commd on one node */
160 160 #define MD_CGS_INITED 0x0001
161 161 #define MD_CGS_ABORTED 0x0002 /* return everything with MDMNE_ABORT */
162 162 uint_t md_commd_global_state = 0; /* No state when starting up */
163 163
164 164 /*
165 165 * Global verbosity level for the daemon
166 166 */
167 167 uint_t md_commd_global_verb;
168 168
169 169 /*
170 170 * libmeta doesn't like multiple threads in metaget_setdesc().
171 171 * So we must protect access to it with a global lock
172 172 */
173 173 mutex_t get_setdesc_mutex;
174 174
175 175 /*
176 176 * Need a way to block single message types,
177 177 * hence an array with a status for every message type
178 178 */
179 179 uint_t msgtype_lock_state[MD_MN_NMESSAGES];
180 180
181 181 /* for reading in the config file */
182 182 #define MAX_LINE_SIZE 1024
183 183
184 184 extern char *commd_get_outfile(void);
185 185 extern uint_t commd_get_verbosity(void);
186 186
187 187 /*
188 188 * mdmn_clnt_create is a helper function for meta_client_create_retry. It
189 189 * merely needs to call clnt_create_timed, and meta_client_create_retry
190 190 * will take care of the rest.
191 191 */
192 192 /* ARGSUSED */
193 193 static CLIENT *
194 194 mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out)
195 195 {
196 196 md_mnnode_desc *node = (md_mnnode_desc *)data;
197 197
198 198 return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, TWO, "tcp",
199 199 time_out));
200 200 }
201 201
202 202 #define FLUSH_DEBUGFILE() \
203 203 if (commdout != (FILE *)NULL) { \
204 204 (void) fflush(commdout); \
205 205 (void) fsync(fileno(commdout)); \
206 206 }
207 207
208 208 static void
209 209 panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval,
210 210 md_mn_result_t *slave_result)
211 211 {
212 212 md_mn_commd_err_t commd_err;
213 213 md_error_t mne = mdnullerror;
214 214 char *msg_buf;
215 215
216 216 msg_buf = (char *)calloc(MAXPATHLEN + 1, sizeof (char));
217 217
218 218 FLUSH_DEBUGFILE();
219 219
220 220 if (master_err != MDMNE_ACK) {
221 221 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC "
222 222 "fail on master when processing message type %d\n", type);
223 223 } else if (slave_result == NULL) {
224 224 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail "
225 225 "on node %d when processing message type %d\n", nid, type);
226 226 } else {
↓ open down ↓ |
226 lines elided |
↑ open up ↑ |
227 227 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: "
228 228 "Inconsistent return value from node %d when processing "
229 229 "message type %d. Master exitval = %d, "
230 230 "Slave exitval = %d\n", nid, type, master_exitval,
231 231 slave_result->mmr_exitval);
232 232 }
233 233 commd_err.size = strlen(msg_buf);
234 234 commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0];
235 235
236 236 (void) metaioctl(MD_MN_COMMD_ERR, &commd_err, &mne, "rpc.mdcommd");
237 - (void) uadmin(A_DUMP, AD_BOOT, NULL);
237 + (void) uadmin(A_DUMP, AD_BOOT, (uintptr_t)NULL);
238 238 }
239 239
240 240 static void
241 241 flush_fcout()
242 242 {
243 243 struct statvfs64 vfsbuf;
244 244 long long avail_bytes;
245 245 int warned = 0;
246 246
247 247 for (; ; ) {
248 248 (void) sleep(10);
249 249 /* No output file, nothing to do */
250 250 if (commdout == (FILE *)NULL)
251 251 continue;
252 252
253 253 /*
254 254 * stat the appropriate filesystem to check for available space.
255 255 */
256 256 if (statvfs64(commdoutfile, &vfsbuf)) {
257 257 continue;
258 258 }
259 259
260 260 avail_bytes = vfsbuf.f_frsize * vfsbuf.f_bavail;
261 261 /*
262 262 * If we don't have enough space, we print out a warning.
263 263 * And we drop the verbosity level to NULL
264 264 * In case the condition doesn't go away, we don't repeat
265 265 * the warning.
266 266 */
267 267 if (avail_bytes < MIN_FS_SPACE) {
268 268 if (warned) {
269 269 continue;
270 270 }
271 271 commd_debug(MD_MMV_SYSLOG,
272 272 "NOT enough space available for logging\n");
273 273 commd_debug(MD_MMV_SYSLOG,
274 274 "Have %lld bytes, need %lld bytes\n",
275 275 avail_bytes, MIN_FS_SPACE);
276 276 warned = 1;
277 277 md_commd_global_verb = MD_MMV_NULL;
278 278 } else {
279 279 warned = 0;
280 280 }
281 281
282 282 (void) fflush(commdout);
283 283 }
284 284 }
285 285
286 286 /* safer version of clnt_destroy. If clnt is NULL don't do anything */
287 287 #define mdmn_clnt_destroy(clnt) { \
288 288 if (clnt) \
289 289 clnt_destroy(clnt); \
290 290 }
291 291
292 292 /*
293 293 * Own version of svc_sendreply that checks the integrity of the transport
294 294 * handle and so prevents us from core dumps in the real svc_sendreply()
295 295 */
296 296 void
297 297 mdmn_svc_sendreply(SVCXPRT *transp, xdrproc_t xdr, caddr_t data)
298 298 {
299 299 if (SVC_STAT(transp) == XPRT_DIED) {
300 300 commd_debug(MD_MMV_MISC,
301 301 "mdmn_svc_sendreply: XPRT_DIED\n");
302 302 return;
303 303 }
304 304 (void) svc_sendreply(transp, xdr, data);
305 305 }
306 306
307 307 /*
308 308 * timeout_initiator(set, class)
309 309 *
310 310 * Alas, I sent a message and didn't get a response back in appropriate time.
311 311 *
312 312 * timeout_initiator() takes care of doing the needed svc_sendreply() to the
313 313 * calling mdmn_send_message, so that guy doesn't wait forever
314 314 * What is done here is pretty much the same as what is done in
315 315 * wakeup initiator. The difference is that we cannot provide for any results,
316 316 * of course and we set the comm_state to MDMNE_TIMEOUT.
317 317 *
318 318 * By doing so, mdmn_send_message can decide if a retry would make sense or not.
319 319 * It's not ours to decide that here.
320 320 */
321 321 void
322 322 timeout_initiator(set_t setno, md_mn_msgclass_t class)
323 323 {
324 324 SVCXPRT *transp;
325 325 md_mn_msgid_t mid;
326 326 md_mn_result_t *resultp;
327 327
328 328 resultp = Zalloc(sizeof (md_mn_result_t));
329 329 resultp->mmr_comm_state = MDMNE_TIMEOUT;
330 330
331 331 commd_debug(MD_MMV_MISC,
332 332 "timeout_initiator set = %d, class = %d\n", setno, class);
333 333
334 334 transp = mdmn_get_initiator_table_transp(setno, class);
335 335 mdmn_get_initiator_table_id(setno, class, &mid);
336 336
337 337 commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n",
338 338 MSGID_ELEMS(mid));
339 339 /*
340 340 * Give the result the corresponding msgid from the failed message.
341 341 */
342 342 MSGID_COPY(&mid, &(resultp->mmr_msgid));
343 343
344 344 /* return to mdmn_send_message() and let it deal with the situation */
345 345 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
346 346
347 347 free(resultp);
348 348 commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n");
349 349 svc_done(transp);
350 350 mdmn_unregister_initiator_table(setno, class);
351 351 }
352 352
353 353
354 354 /*
355 355 * check_timeouts - thread
356 356 *
357 357 * This implements a timeout surveillance for messages sent from the
358 358 * initiator to the master.
359 359 *
360 360 * If a message is started, this thread is triggered thru
361 361 * cond_signal(&check_timeout_cv) and we keep track of the numbers of
362 362 * messages that are outstanding (messages_on_their_way).
363 363 *
364 364 * As long as there are messages on their way, this thread never goes to sleep.
365 365 * It'll keep checking all class/set combinations for outstanding messages.
366 366 * If one is found, it's checked if this message is overdue. In that case,
367 367 * timeout_initiator() is called to wakeup the calling mdmn_send_message and
368 368 * to clean up the mess.
369 369 *
370 370 * If the result from the master arrives later, this message is considered
371 371 * to be unsolicited and will be ignored.
372 372 */
373 373
374 374 void
375 375 check_timeouts()
376 376 {
377 377 set_t setno;
378 378 time_t now, then;
379 379 mutex_t *mx;
380 380 md_mn_msgclass_t class;
381 381
382 382 for (; ; ) {
383 383 now = time((time_t *)NULL);
384 384 for (setno = 1; setno < MD_MAXSETS; setno++) {
385 385 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
386 386 continue;
387 387 }
388 388 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES;
389 389 class++) {
390 390 mx = mdmn_get_initiator_table_mx(setno, class);
391 391 (void) mutex_lock(mx);
392 392
393 393 /* then is the registered time */
394 394 then =
395 395 mdmn_get_initiator_table_time(setno, class);
396 396 if ((then != 0) && (now > then)) {
397 397 timeout_initiator(setno, class);
398 398 }
399 399 (void) mutex_unlock(mx);
400 400 }
401 401 }
402 402 /* it's ok to check only once per second */
403 403 (void) sleep(1);
404 404
405 405 /* is there work to do? */
406 406 (void) mutex_lock(&check_timeout_mutex);
407 407 if (messages_on_their_way == 0) {
408 408 (void) cond_wait(&check_timeout_cv,
409 409 &check_timeout_mutex);
410 410 }
411 411 (void) mutex_unlock(&check_timeout_mutex);
412 412 }
413 413 }
414 414
415 415 void
416 416 setup_debug(void)
417 417 {
418 418 char *tmp_dir;
419 419
420 420 /* Read in the debug-controlling tokens from runtime.cf */
421 421 md_commd_global_verb = commd_get_verbosity();
422 422 /*
423 423 * If the user didn't specify a verbosity level in runtime.cf
424 424 * we can safely return here. As we don't intend to printout
425 425 * debug messages, we don't need to check for the output file.
426 426 */
427 427 if (md_commd_global_verb == 0) {
428 428 return;
429 429 }
430 430
431 431 /* if commdout is non-NULL it is an open FILE, we'd better close it */
432 432 if (commdout != (FILE *)NULL) {
433 433 (void) fclose(commdout);
434 434 }
435 435
436 436 commdoutfile = commd_get_outfile();
437 437
438 438 /* setup the debug output */
439 439 if (commdoutfile == (char *)NULL) {
440 440 /* if no valid file was specified, use the default */
441 441 commdoutfile = "/var/run/commd.out";
442 442 commdout = fopen(commdoutfile, "a");
443 443 } else {
444 444 /* check if the directory exists and is writable */
445 445 tmp_dir = strdup(commdoutfile);
446 446 if ((access(dirname(tmp_dir), X_OK|W_OK)) ||
447 447 ((commdout = fopen(commdoutfile, "a")) == (FILE *)NULL)) {
448 448 syslog(LOG_ERR,
449 449 "Can't write to specified output file %s,\n"
450 450 "using /var/run/commd.out instead\n", commdoutfile);
451 451 free(commdoutfile);
452 452 commdoutfile = "/var/run/commd.out";
453 453 commdout = fopen(commdoutfile, "a");
454 454 }
455 455 free(tmp_dir);
456 456 }
457 457
458 458 if (commdout == (FILE *)NULL) {
459 459 syslog(LOG_ERR, "Can't write to debug output file %s\n",
460 460 commdoutfile);
461 461 }
462 462 }
463 463
464 464 /*
465 465 * mdmn_is_node_dead checks to see if a node is dead using
466 466 * the SunCluster infrastructure which is a stable interface.
467 467 * If unable to contact SunCluster the node is assumed to be alive.
468 468 * Return values:
469 469 * 1 - node is dead
470 470 * 0 - node is alive
471 471 */
472 472 int
473 473 mdmn_is_node_dead(md_mnnode_desc *node)
474 474 {
475 475 char *fmt = "/usr/cluster/bin/scha_cluster_get -O NODESTATE_NODE ";
476 476 char *cmd;
477 477 size_t size;
478 478 char buf[10];
479 479 FILE *ptr;
480 480 int retval = 0;
481 481
482 482 /* I know that I'm alive */
483 483 if (strcmp(node->nd_nodename, mynode()) == 0)
484 484 return (retval);
485 485
486 486 size = strlen(fmt) + strlen(node->nd_nodename) + 1;
487 487 cmd = Zalloc(size);
488 488 (void) strlcat(cmd, fmt, size);
489 489 (void) strlcat(cmd, node->nd_nodename, size);
490 490
491 491 if ((ptr = popen(cmd, "r")) != NULL) {
492 492 if (fgets(buf, sizeof (buf), ptr) != NULL) {
493 493 /* If scha_cluster_get returned DOWN - return dead */
494 494 if (strncmp(buf, "DOWN", 4) == 0)
495 495 retval = 1;
496 496 }
497 497 (void) pclose(ptr);
498 498 }
499 499 Free(cmd);
500 500 return (retval);
501 501 }
502 502
503 503 /*
504 504 * global_init()
505 505 *
506 506 * Perform some global initializations.
507 507 *
508 508 * the following routines have to call this before operation can start:
509 509 * - mdmn_send_svc_2
510 510 * - mdmn_work_svc_2
511 511 * - mdmn_comm_lock_svc_2
512 512 * - mdmn_comm_unlock_svc_2
513 513 * - mdmn_comm_suspend_svc_2
514 514 * - mdmn_comm_resume_svc_2
515 515 * - mdmn_comm_reinit_set_svc_2
516 516 *
517 517 * This is a single threaded daemon, so it can only be in one of the above
518 518 * routines at the same time.
519 519 * This means, global_init() cannot be called more than once at the same time.
520 520 * Hence, no lock is needed.
521 521 */
522 522 void
523 523 global_init(void)
524 524 {
525 525 set_t set;
526 526 md_mn_msgclass_t class;
527 527 struct sigaction sighandler;
528 528 time_t clock_val;
529 529 struct rlimit commd_limit;
530 530
531 531
532 532
533 533 /* Do these global initializations only once */
534 534 if (md_commd_global_state & MD_CGS_INITED) {
535 535 return;
536 536 }
537 537 (void) sdssc_bind_library();
538 538
539 539 /* setup the debug options from the config file */
540 540 setup_debug();
541 541
542 542 /* make sure that we don't run out of file descriptors */
543 543 commd_limit.rlim_cur = commd_limit.rlim_max = RLIM_INFINITY;
544 544 if (setrlimit(RLIMIT_NOFILE, &commd_limit) != 0) {
545 545 syslog(LOG_WARNING, gettext("setrlimit failed."
546 546 "Could not increase the max file descriptors"));
547 547 }
548 548
549 549 /* Make setup_debug() be the action in case of SIGHUP */
550 550 sighandler.sa_flags = 0;
551 551 (void) sigfillset(&sighandler.sa_mask);
552 552 sighandler.sa_handler = (void (*)(int)) setup_debug;
553 553 (void) sigaction(SIGHUP, &sighandler, NULL);
554 554
555 555 __savetime = gethrtime();
556 556 (void) time(&clock_val);
557 557 commd_debug(MD_MMV_MISC, "global init called %s\n", ctime(&clock_val));
558 558
559 559 /* start a thread that flushes out the debug on a regular basis */
560 560 (void) thr_create(NULL, 0, (void *(*)(void *))flush_fcout,
561 561 (void *) NULL, THR_DETACHED, NULL);
562 562
563 563 /* global rwlock's / mutex's / cond_t's go here */
564 564 (void) mutex_init(&check_timeout_mutex, USYNC_THREAD, NULL);
565 565 (void) cond_init(&check_timeout_cv, USYNC_THREAD, NULL);
566 566 (void) mutex_init(&get_setdesc_mutex, USYNC_THREAD, NULL);
567 567
568 568 /* Make sure the initiator table is initialized correctly */
569 569 for (set = 0; set < MD_MAXSETS; set++) {
570 570 for (class = 0; class < MD_MN_NCLASSES; class++) {
571 571 mdmn_unregister_initiator_table(set, class);
572 572 }
573 573 }
574 574
575 575
576 576 /* setup the check for timeouts */
577 577 (void) thr_create(NULL, 0, (void *(*)(void *))check_timeouts,
578 578 (void *) NULL, THR_DETACHED, NULL);
579 579
580 580 md_commd_global_state |= MD_CGS_INITED;
581 581 }
582 582
583 583
584 584 /*
585 585 * mdmn_init_client(setno, nodeid)
586 586 * called if client[setno][nodeid] is NULL
587 587 *
588 588 * NOTE: Must be called with set_desc_rwlock held as a reader
589 589 * NOTE: Must be called with client_rwlock held as a writer
590 590 *
591 591 * If the rpc client for this node has not been setup for any set, we do it now.
592 592 *
593 593 * Returns 0 on success (node found in set, rpc client setup)
594 594 * -1 if metaget_setdesc failed,
595 595 * -2 if node not part of set
596 596 * -3 if clnt_create fails
597 597 */
598 598 static int
599 599 mdmn_init_client(set_t setno, md_mn_nodeid_t nid)
600 600 {
601 601 md_error_t ep = mdnullerror;
602 602 md_mnnode_desc *node;
603 603 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */
604 604
605 605 sd = set_descriptor[setno];
606 606
607 607 /*
608 608 * Is the appropriate set_descriptor already initialized ?
609 609 * Can't think of a scenario where this is not the case, but we'd better
610 610 * check for it anyway.
611 611 */
612 612 if (sd == NULL) {
613 613 mdsetname_t *sp;
614 614
615 615 /* readlock -> writelock */
616 616 (void) rw_unlock(&set_desc_rwlock[setno]);
617 617 (void) rw_wrlock(&set_desc_rwlock[setno]);
618 618 sp = metasetnosetname(setno, &ep);
619 619 /* Only one thread is supposed to be in metaget_setdesc() */
620 620 (void) mutex_lock(&get_setdesc_mutex);
621 621 sd = metaget_setdesc(sp, &ep);
622 622 (void) mutex_unlock(&get_setdesc_mutex);
623 623 if (sd == NULL) {
624 624 /* back to ... */
625 625 (void) rw_unlock(&set_desc_rwlock[setno]);
626 626 /* ... readlock */
627 627 (void) rw_rdlock(&set_desc_rwlock[setno]);
628 628 return (-1);
629 629 }
630 630 set_descriptor[setno] = sd;
631 631 /* back to readlock */
632 632 (void) rw_unlock(&set_desc_rwlock[setno]);
633 633 (void) rw_rdlock(&set_desc_rwlock[setno]);
634 634 }
635 635
636 636 /* first we have to find the node name for this node id */
637 637 for (node = sd->sd_nodelist; node; node = node->nd_next) {
638 638 if (node->nd_nodeid == nid)
639 639 break; /* we found our node in this set */
640 640 }
641 641
642 642
643 643 if (node == (md_mnnode_desc *)NULL) {
644 644 commd_debug(MD_MMV_SYSLOG,
645 645 "FATAL: node %d not found in set %d\n", nid, setno);
646 646 (void) rw_unlock(&set_desc_rwlock[setno]);
647 647 return (-2);
648 648 }
649 649
650 650 commd_debug(MD_MMV_INIT, "init: %s has the flags: 0x%x\n",
651 651 node->nd_nodename ? node->nd_nodename : "NULL", node->nd_flags);
652 652
653 653 /* Did this node join the diskset? */
654 654 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
655 655 commd_debug(MD_MMV_INIT, "init: %s didn't join set %d\n",
656 656 node->nd_nodename ? node->nd_nodename : "NULL", setno);
657 657 (void) rw_unlock(&set_desc_rwlock[setno]);
658 658 return (-2);
659 659 }
660 660
661 661 /* if clnt_create has not been done for that node, do it now */
662 662 if (client[setno][nid] == (CLIENT *) NULL) {
663 663 time_t tout = 0;
664 664
665 665 /*
666 666 * While trying to create a connection to a node,
667 667 * periodically check to see if the node has been marked
668 668 * dead by the SunCluster infrastructure.
669 669 * This periodic check is needed since a non-responsive
670 670 * rpc.mdcommd (while it is attempting to create a connection
671 671 * to a dead node) can lead to large delays and/or failures
672 672 * in the reconfig steps.
673 673 */
674 674 while ((client[setno][nid] == (CLIENT *) NULL) &&
675 675 (tout < MD_CLNT_CREATE_TOUT)) {
676 676 client[setno][nid] = meta_client_create_retry(
677 677 node->nd_nodename, mdmn_clnt_create,
678 678 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
679 679 /* Is the node dead? */
680 680 if (mdmn_is_node_dead(node) == 1) {
681 681 commd_debug(MD_MMV_SYSLOG,
682 682 "rpc.mdcommd: no client for dead node %s\n",
683 683 node->nd_nodename);
684 684 break;
685 685 } else
686 686 tout += MD_CLNT_CREATE_SUBTIMEOUT;
687 687 }
688 688
689 689 if (client[setno][nid] == (CLIENT *) NULL) {
690 690 clnt_pcreateerror(node->nd_nodename);
691 691 (void) rw_unlock(&set_desc_rwlock[setno]);
692 692 return (-3);
693 693 }
694 694 /* this node has the license to send */
695 695 commd_debug(MD_MMV_MISC, "init_client: calling add_lic\n");
696 696 add_license(node);
697 697
698 698 /* set the timeout value */
699 699 clnt_control(client[setno][nid], CLSET_TIMEOUT,
700 700 (char *)&FOUR_SECS);
701 701
702 702 }
703 703 (void) rw_unlock(&set_desc_rwlock[setno]);
704 704 return (0);
705 705 }
706 706
707 707 /*
708 708 * check_client(setno, nodeid)
709 709 *
710 710 * must be called with reader lock held for set_desc_rwlock[setno]
711 711 * and must be called with reader lock held for client_rwlock[setno]
712 712 * Checks if the client for this set/node combination is already setup
713 713 * if not it upgrades the lock to a writer lock
714 714 * and tries to initialize the client.
715 715 * Finally it's checked if the client was nulled out again due to some race
716 716 *
717 717 * returns 0 if there is a usable client
718 718 * returns MDMNE_RPC_FAIL otherwise
719 719 */
720 720 static int
721 721 check_client(set_t setno, md_mn_nodeid_t nodeid)
722 722 {
723 723 int ret = 0;
724 724
725 725 while ((client[setno][nodeid] == (CLIENT *)NULL) && (ret == 0)) {
726 726 /* upgrade reader ... */
727 727 (void) rw_unlock(&client_rwlock[setno]);
728 728 /* ... to writer lock. */
729 729 (void) rw_wrlock(&client_rwlock[setno]);
730 730 if (mdmn_init_client(setno, nodeid) != 0) {
731 731 ret = MDMNE_RPC_FAIL;
732 732 }
733 733 /* downgrade writer ... */
734 734 (void) rw_unlock(&client_rwlock[setno]);
735 735 /* ... back to reader lock. */
736 736 (void) rw_rdlock(&client_rwlock[setno]);
737 737 }
738 738 return (ret);
739 739 }
740 740
741 741 /*
742 742 * mdmn_init_set(setno, todo)
743 743 * setno is the number of the set to be initialized.
744 744 * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY
745 745 * If called with MDMN_SET_READY everything is initialized.
746 746 *
747 747 * If the set mutexes are already initialized, the caller has to hold
748 748 * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before
749 749 * calling mdmn_init_set()
750 750 */
751 751 int
752 752 mdmn_init_set(set_t setno, int todo)
753 753 {
754 754 int class;
755 755 md_mnnode_desc *node;
756 756 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */
757 757 mdsetname_t *sp;
758 758 md_error_t ep = mdnullerror;
759 759 md_mn_nodeid_t nid;
760 760
761 761 /*
762 762 * Check if we are told to setup the mutexes and
763 763 * if these are not yet setup
764 764 */
765 765 if ((todo & MDMN_SET_MUTEXES) &&
766 766 ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0)) {
767 767 (void) mutex_init(&mdmn_busy_mutex[setno], USYNC_THREAD, NULL);
768 768 (void) cond_init(&mdmn_busy_cv[setno], USYNC_THREAD, NULL);
769 769 (void) rwlock_init(&client_rwlock[setno], USYNC_THREAD, NULL);
770 770 (void) rwlock_init(&set_desc_rwlock[setno], USYNC_THREAD, NULL);
771 771
772 772 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
773 773 (void) mutex_init(mdmn_get_master_table_mx(setno,
774 774 class), USYNC_THREAD, NULL);
775 775 (void) cond_init(mdmn_get_master_table_cv(setno, class),
776 776 USYNC_THREAD, NULL);
777 777 (void) mutex_init(mdmn_get_initiator_table_mx(setno,
778 778 class), USYNC_THREAD, NULL);
779 779 }
780 780 md_mn_set_inited[setno] |= MDMN_SET_MUTEXES;
781 781 }
782 782 if ((todo & MDMN_SET_MCT) &&
783 783 ((md_mn_set_inited[setno] & MDMN_SET_MCT) == 0)) {
784 784 int fd;
785 785 size_t filesize;
786 786 caddr_t addr;
787 787 char table_name[32];
788 788 struct flock fl;
789 789
790 790 filesize = (sizeof (md_mn_mct_t));
791 791 (void) snprintf(table_name, sizeof (table_name), "%s%d",
792 792 MD_MN_MSG_COMP_TABLE, setno);
793 793 /*
794 794 * If the mct file exists we map it into memory.
795 795 * Otherwise we create an empty file of appropriate
796 796 * size and map that into memory.
797 797 * The mapped areas are stored in mct[setno].
798 798 */
799 799 fd = open(table_name, O_RDWR|O_CREAT|O_DSYNC, 0600);
800 800 if (fd < 0) {
801 801 commd_debug(MD_MMV_MISC,
802 802 "init_set: Can't open MCT\n");
803 803 return (-1);
804 804 }
805 805 /*
806 806 * Ensure that we are the only process that has this file
807 807 * mapped. If another instance of rpc.mdcommd has beaten us
808 808 * then we display the failing process and attempt to terminate
809 809 * it. The next call of this routine should establish us as
810 810 * the only rpc.mdcommd on the system.
811 811 */
812 812 (void) memset(&fl, 0, sizeof (fl));
813 813 fl.l_type = F_WRLCK;
814 814 fl.l_whence = SEEK_SET;
815 815 fl.l_start = 0;
816 816 fl.l_len = filesize + 1;
817 817
818 818 if (fcntl(fd, F_SETLK, &fl) == -1) {
819 819 commd_debug(MD_MMV_SYSLOG,
820 820 "init_set: Cannot lock MCT '%s'\n", table_name);
821 821 if (fcntl(fd, F_GETLK, &fl) != -1) {
822 822 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:"
823 823 "Process %d holds lock\n", fl.l_pid);
824 824 (void) close(fd);
825 825 } else {
826 826 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:"
827 827 "F_GETLK failed\n");
828 828 (void) close(fd);
829 829 return (-1);
830 830 }
831 831
832 832 /*
833 833 * Try to terminate other mdcommd process so that we
834 834 * can establish ourselves.
835 835 */
836 836 if (sigsend(P_PID, fl.l_pid, 0) == 0) {
837 837 if (sigsend(P_PID, fl.l_pid, SIGKILL) < 0) {
838 838 commd_debug(MD_MMV_SYSLOG,
839 839 "rpc.mdcommd:"
840 840 "SIGKILL of %d failed\n", fl.l_pid);
841 841 } else {
842 842 commd_debug(MD_MMV_SYSLOG,
843 843 "rpc.mdcommd:"
844 844 "Process %d killed\n", fl.l_pid);
845 845 }
846 846 } else {
847 847 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:"
848 848 "Process %d not killable\n", fl.l_pid);
849 849 }
850 850 return (-1);
851 851 }
852 852 /*
853 853 * To ensure that the file has the appropriate size,
854 854 * we write a byte at the end of the file.
855 855 */
856 856 (void) lseek(fd, filesize + 1, SEEK_SET);
857 857 (void) write(fd, "\0", 1);
858 858
859 859 /* at this point we have a file in place that we can mmap */
860 860 addr = mmap(0, filesize, PROT_READ | PROT_WRITE,
861 861 MAP_SHARED, fd, (off_t)0);
862 862 if (addr == MAP_FAILED) {
863 863 commd_debug(MD_MMV_INIT,
864 864 "init_set: mmap mct error %d\n",
865 865 errno);
866 866 return (-1);
867 867 }
868 868 /* LINTED pointer alignment */
869 869 mct[setno] = (md_mn_mct_t *)addr;
870 870
871 871 /* finally we initialize the mutexes that protect the mct */
872 872 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
873 873 (void) mutex_init(&(mct_mutex[setno][class]),
874 874 USYNC_THREAD, NULL);
875 875 }
876 876
877 877 md_mn_set_inited[setno] |= MDMN_SET_MCT;
878 878 }
879 879 /*
880 880 * Check if we are told to setup the nodes and
881 881 * if these are not yet setup
882 882 * (Attention: negative logic here compared to above!)
883 883 */
884 884 if (((todo & MDMN_SET_NODES) == 0) ||
885 885 (md_mn_set_inited[setno] & MDMN_SET_NODES)) {
886 886 return (0); /* success */
887 887 }
888 888
889 889 if ((sp = metasetnosetname(setno, &ep)) == NULL) {
890 890 commd_debug(MD_MMV_SYSLOG,
891 891 "metasetnosetname(%d) returned NULL\n", setno);
892 892 return (MDMNE_NOT_JOINED);
893 893 }
894 894
895 895 /* flush local copy of rpc.metad data */
896 896 metaflushsetname(sp);
897 897
898 898 (void) mutex_lock(&get_setdesc_mutex);
899 899 sd = metaget_setdesc(sp, &ep);
900 900 (void) mutex_unlock(&get_setdesc_mutex);
901 901
902 902 if (sd == NULL) {
903 903 commd_debug(MD_MMV_SYSLOG,
904 904 "metaget_setdesc(%d) returned NULL\n", setno);
905 905 return (MDMNE_NOT_JOINED);
906 906 }
907 907
908 908 /*
909 909 * if this set is not a multinode set or
910 910 * this node didn't join yet the diskset, better don't do anything
911 911 */
912 912 if ((MD_MNSET_DESC(sd) == 0) ||
913 913 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN) == 0) {
914 914 commd_debug(MD_MMV_INIT, "didn't yet join set %d\n", setno);
915 915 return (MDMNE_NOT_JOINED);
916 916 }
917 917
918 918 for (node = sd->sd_nodelist; node != NULL; node = node->nd_next) {
919 919 time_t tout = 0;
920 920 nid = node->nd_nodeid;
921 921
922 922 commd_debug(MD_MMV_INIT,
923 923 "setting up: node=%s, priv_ic=%s, flags=0x%x\n",
924 924 node->nd_nodename ? node->nd_nodename : "NULL",
925 925 node->nd_priv_ic ? node->nd_priv_ic : "NULL",
926 926 node->nd_flags);
927 927
928 928 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
929 929 commd_debug(MD_MMV_INIT,
930 930 "init: %s didn't join set %d\n",
931 931 node->nd_nodename ? node->nd_nodename : "NULL",
932 932 setno);
933 933 continue;
934 934 }
935 935
936 936 if (client[setno][nid] != (CLIENT *) NULL) {
937 937 /* already inited */
938 938 commd_debug(MD_MMV_INIT, "init: already: node=%s\n",
939 939 node->nd_nodename ? node->nd_nodename : "NULL");
940 940 continue;
941 941 }
942 942
943 943 /*
944 944 * While trying to create a connection to a node,
945 945 * periodically check to see if the node has been marked
946 946 * dead by the SunCluster infrastructure.
947 947 * This periodic check is needed since a non-responsive
948 948 * rpc.mdcommd (while it is attempting to create a connection
949 949 * to a dead node) can lead to large delays and/or failures
950 950 * in the reconfig steps.
951 951 */
952 952 while ((client[setno][nid] == (CLIENT *) NULL) &&
953 953 (tout < MD_CLNT_CREATE_TOUT)) {
954 954 client[setno][nid] = meta_client_create_retry(
955 955 node->nd_nodename, mdmn_clnt_create,
956 956 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
957 957 /* Is the node dead? */
958 958 if (mdmn_is_node_dead(node) == 1) {
959 959 commd_debug(MD_MMV_SYSLOG,
960 960 "rpc.mdcommd: no client for dead node %s\n",
961 961 node->nd_nodename);
962 962 break;
963 963 } else
964 964 tout += MD_CLNT_CREATE_SUBTIMEOUT;
965 965 }
966 966
967 967 if (client[setno][nid] == (CLIENT *) NULL) {
968 968 clnt_pcreateerror(node->nd_nodename);
969 969 /*
970 970 * If we cannot connect to a single node
971 971 * (maybe because it is down) we mark this node as not
972 972 * owned and continue with the next node in the list.
973 973 * This is better than failing the entire starting up
974 974 * of the commd system.
975 975 */
976 976 node->nd_flags &= ~MD_MN_NODE_OWN;
977 977 commd_debug(MD_MMV_SYSLOG,
978 978 "WARNING couldn't create client for %s\n"
979 979 "Reconfig cycle required\n",
980 980 node->nd_nodename);
981 981 commd_debug(MD_MMV_INIT,
982 982 "WARNING couldn't create client for %s\n"
983 983 "Reconfig cycle required\n",
984 984 node->nd_nodename);
985 985 continue;
986 986 }
987 987 /* this node has the license to send */
988 988 commd_debug(MD_MMV_MISC, "init_set: calling add_lic\n");
989 989 add_license(node);
990 990
991 991 /* set the timeout value */
992 992 clnt_control(client[setno][nid], CLSET_TIMEOUT,
993 993 (char *)&FOUR_SECS);
994 994
995 995 commd_debug(MD_MMV_INIT, "init: done: node=%s\n",
996 996 node->nd_nodename ? node->nd_nodename : "NULL");
997 997 }
998 998
999 999 set_descriptor[setno] = sd;
1000 1000 md_mn_set_inited[setno] |= MDMN_SET_NODES;
1001 1001 return (0); /* success */
1002 1002 }
1003 1003
1004 1004 void *
1005 1005 mdmn_send_to_work(void *arg)
1006 1006 {
1007 1007 int *rpc_err = NULL;
1008 1008 int success;
1009 1009 int try_master;
1010 1010 set_t setno;
1011 1011 mutex_t *mx; /* protection for initiator_table */
1012 1012 SVCXPRT *transp;
1013 1013 md_mn_msg_t *msg;
1014 1014 md_mn_nodeid_t set_master;
1015 1015 md_mn_msgclass_t class;
1016 1016 md_mn_msg_and_transp_t *matp = (md_mn_msg_and_transp_t *)arg;
1017 1017
1018 1018 msg = matp->mat_msg;
1019 1019 transp = matp->mat_transp;
1020 1020
1021 1021 class = mdmn_get_message_class(msg->msg_type);
1022 1022 setno = msg->msg_setno;
1023 1023
1024 1024 /* set the sender, so the master knows who to send the results */
1025 1025 (void) rw_rdlock(&set_desc_rwlock[setno]);
1026 1026 msg->msg_sender = set_descriptor[setno]->sd_mn_mynode->nd_nodeid;
1027 1027 set_master = set_descriptor[setno]->sd_mn_master_nodeid;
1028 1028
1029 1029 mx = mdmn_get_initiator_table_mx(setno, class);
1030 1030 (void) mutex_lock(mx);
1031 1031
1032 1032 /*
1033 1033 * Here we check, if the initiator table slot for this set/class
1034 1034 * combination is free to use.
1035 1035 * If this is not the case, we return CLASS_BUSY forcing the
1036 1036 * initiating send_message call to retry
1037 1037 */
1038 1038 success = mdmn_check_initiator_table(setno, class);
1039 1039 if (success == MDMNE_CLASS_BUSY) {
1040 1040 md_mn_msgid_t active_mid;
1041 1041
1042 1042 mdmn_get_initiator_table_id(setno, class, &active_mid);
1043 1043
1044 1044 commd_debug(MD_MMV_SEND,
1045 1045 "send_to_work: received but locally busy "
1046 1046 "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
1047 1047 "active msg=(%d, 0x%llx-%d)\n",
1048 1048 MSGID_ELEMS(msg->msg_msgid), setno, class,
1049 1049 msg->msg_type, MSGID_ELEMS(active_mid));
1050 1050 } else {
1051 1051 commd_debug(MD_MMV_SEND,
1052 1052 "send_to_work: received (%d, 0x%llx-%d), "
1053 1053 "set=%d, class=%d, type=%d\n",
1054 1054 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
1055 1055 }
1056 1056
1057 1057 try_master = 2; /* return failure after two retries */
1058 1058 while ((success == MDMNE_ACK) && (try_master--)) {
1059 1059 (void) rw_rdlock(&client_rwlock[setno]);
1060 1060 /* is the rpc client to the master still around ? */
1061 1061 if (check_client(setno, set_master)) {
1062 1062 success = MDMNE_RPC_FAIL;
1063 1063 FLUSH_DEBUGFILE();
1064 1064 (void) rw_unlock(&client_rwlock[setno]);
1065 1065 break; /* out of try_master-loop */
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * Send the request to the work function on the master
1070 1070 * this call will return immediately
1071 1071 */
1072 1072 rpc_err = mdmn_work_2(msg, client[setno][set_master],
1073 1073 set_master);
1074 1074
1075 1075 /* Everything's Ok? */
1076 1076 if (rpc_err == NULL) {
1077 1077 success = MDMNE_RPC_FAIL;
1078 1078 /*
1079 1079 * Probably something happened to the daemon on the
1080 1080 * master. Kill the client, and try again...
1081 1081 */
1082 1082 (void) rw_unlock(&client_rwlock[setno]);
1083 1083 (void) rw_wrlock(&client_rwlock[setno]);
1084 1084 mdmn_clnt_destroy(client[setno][set_master]);
1085 1085 if (client[setno][set_master] != (CLIENT *)NULL) {
1086 1086 client[setno][set_master] = (CLIENT *)NULL;
1087 1087 }
1088 1088 (void) rw_unlock(&client_rwlock[setno]);
1089 1089 continue;
1090 1090
1091 1091 } else if (*rpc_err != MDMNE_ACK) {
1092 1092 /* something went wrong, break out */
1093 1093 success = *rpc_err;
1094 1094 free(rpc_err);
1095 1095 (void) rw_unlock(&client_rwlock[setno]);
1096 1096 break; /* out of try_master-loop */
1097 1097 }
1098 1098
1099 1099 (void) rw_unlock(&client_rwlock[setno]);
1100 1100 free(rpc_err);
1101 1101
1102 1102 /*
1103 1103 * If we are here, we sucessfully delivered the message.
1104 1104 * We register the initiator_table, so that
1105 1105 * wakeup_initiator_2 can do the sendreply with the
1106 1106 * results for us.
1107 1107 */
1108 1108 success = MDMNE_ACK;
1109 1109 mdmn_register_initiator_table(setno, class, msg, transp);
1110 1110
1111 1111 /* tell check_timeouts, there's work to do */
1112 1112 (void) mutex_lock(&check_timeout_mutex);
1113 1113 messages_on_their_way++;
1114 1114 (void) cond_signal(&check_timeout_cv);
1115 1115 (void) mutex_unlock(&check_timeout_mutex);
1116 1116 break; /* out of try_master-loop */
1117 1117 }
1118 1118
1119 1119 (void) rw_unlock(&set_desc_rwlock[setno]);
1120 1120
1121 1121 if (success == MDMNE_ACK) {
1122 1122 commd_debug(MD_MMV_SEND,
1123 1123 "send_to_work: registered (%d, 0x%llx-%d)\n",
1124 1124 MSGID_ELEMS(msg->msg_msgid));
1125 1125 } else {
1126 1126 /* In case of failure do the sendreply now */
1127 1127 md_mn_result_t *resultp;
1128 1128 resultp = Zalloc(sizeof (md_mn_result_t));
1129 1129 resultp->mmr_comm_state = success;
1130 1130 /*
1131 1131 * copy the MSGID so that we know _which_ message
1132 1132 * failed (if the transp has got mangled)
1133 1133 */
1134 1134 MSGID_COPY(&(msg->msg_msgid), &(resultp->mmr_msgid));
1135 1135 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
1136 1136 commd_debug(MD_MMV_SEND,
1137 1137 "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n",
1138 1138 MSGID_ELEMS(msg->msg_msgid), success);
1139 1139 free_result(resultp);
1140 1140 /*
1141 1141 * We don't have a timeout registered to wake us up, so we're
1142 1142 * now done with this handle. Release it back to the pool.
1143 1143 */
1144 1144 svc_done(transp);
1145 1145
1146 1146 }
1147 1147
1148 1148 free_msg(msg);
1149 1149 /* the alloc was done in mdmn_send_svc_2 */
1150 1150 Free(matp);
1151 1151 (void) mutex_unlock(mx);
1152 1152 return (NULL);
1153 1153
1154 1154 }
1155 1155
1156 1156 /*
1157 1157 * do_message_locally(msg, result)
1158 1158 * Process a message locally on the master
1159 1159 * Lookup the MCT if the message has already been processed.
1160 1160 * If not, call the handler and store the result
1161 1161 * If yes, retrieve the result from the MCT.
1162 1162 * Return:
1163 1163 * MDMNE_ACK in case of success
1164 1164 * MDMNE_LOG_FAIL if the MCT could not be checked
1165 1165 */
1166 1166 static int
1167 1167 do_message_locally(md_mn_msg_t *msg, md_mn_result_t *result)
1168 1168 {
1169 1169 int completed;
1170 1170 set_t setno;
1171 1171 md_mn_msgtype_t msgtype = msg->msg_type;
1172 1172 md_mn_msgclass_t class;
1173 1173
1174 1174 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);
1175 1175
1176 1176 handler = mdmn_get_handler(msgtype);
1177 1177 if (handler == NULL) {
1178 1178 result->mmr_exitval = 0;
1179 1179 /* let the sender decide if this is an error or not */
1180 1180 result->mmr_comm_state = MDMNE_NO_HANDLER;
1181 1181 return (MDMNE_NO_HANDLER);
1182 1182 }
1183 1183
1184 1184 class = mdmn_get_message_class(msg->msg_type);
1185 1185 setno = msg->msg_setno;
1186 1186
1187 1187 result->mmr_msgtype = msgtype;
1188 1188 result->mmr_flags = msg->msg_flags;
1189 1189 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1190 1190
1191 1191 (void) mutex_lock(&mct_mutex[setno][class]);
1192 1192 completed = mdmn_check_completion(msg, result);
1193 1193 if (completed == MDMN_MCT_NOT_DONE) {
1194 1194 /* message not yet processed locally */
1195 1195 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1196 1196 "calling handler for (%d,0x%llx-%d) type %d\n",
1197 1197 MSGID_ELEMS(msg->msg_msgid), msgtype);
1198 1198
1199 1199 /*
1200 1200 * Mark the message as being currently processed,
1201 1201 * so we won't start a second handler for it
1202 1202 */
1203 1203 (void) mdmn_mark_completion(msg, NULL, MDMN_MCT_IN_PROGRESS);
1204 1204 (void) mutex_unlock(&mct_mutex[setno][class]);
1205 1205
1206 1206 /* here we actually process the message on the master */
1207 1207 (*handler)(msg, MD_MSGF_ON_MASTER, result);
1208 1208
1209 1209 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1210 1210 "finished handler for (%d,0x%llx-%d) type %d\n",
1211 1211 MSGID_ELEMS(msg->msg_msgid), msgtype);
1212 1212
1213 1213 /* Mark the message as fully processed, store the result */
1214 1214 (void) mutex_lock(&mct_mutex[setno][class]);
1215 1215 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
1216 1216 } else if (completed == MDMN_MCT_DONE) {
1217 1217 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1218 1218 "result for (%d, 0x%llx-%d) from MCT\n",
1219 1219 MSGID_ELEMS(msg->msg_msgid), msgtype);
1220 1220 } else if (completed == MDMN_MCT_IN_PROGRESS) {
1221 1221 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1222 1222 "(%d, 0x%llx-%d) is currently being processed\n",
1223 1223 MSGID_ELEMS(msg->msg_msgid), msgtype);
1224 1224 } else {
1225 1225 /* MCT error occurred (should never happen) */
1226 1226 (void) mutex_unlock(&mct_mutex[setno][class]);
1227 1227 result->mmr_comm_state = MDMNE_LOG_FAIL;
1228 1228 commd_debug(MD_MMV_SYSLOG, "WARNING "
1229 1229 "mdmn_check_completion returned %d "
1230 1230 "for (%d,0x%llx-%d)\n", completed,
1231 1231 MSGID_ELEMS(msg->msg_msgid));
1232 1232 return (MDMNE_LOG_FAIL);
1233 1233 }
1234 1234 (void) mutex_unlock(&mct_mutex[setno][class]);
1235 1235 return (MDMNE_ACK);
1236 1236
1237 1237 }
1238 1238
1239 1239 /*
1240 1240 * do_send_message(msg, node)
1241 1241 *
1242 1242 * Send a message to a given node and wait for a acknowledgment, that the
1243 1243 * message has arrived on the remote node.
1244 1244 * Make sure that the client for the set is setup correctly.
1245 1245 * If no ACK arrives, destroy and recreate the RPC client and retry the
1246 1246 * message one time
1247 1247 * After actually sending wait no longer than the appropriate number of
1248 1248 * before timing out the message.
1249 1249 *
1250 1250 * Note must be called with set_desc_wrlock held in reader mode
1251 1251 */
1252 1252 static int
1253 1253 do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node)
1254 1254 {
1255 1255 int err;
1256 1256 int rpc_retries;
1257 1257 int timeout_retries = 0;
1258 1258 int *ret = NULL;
1259 1259 set_t setno;
1260 1260 cond_t *cv; /* see mdmn_wakeup_master_svc_2 */
1261 1261 mutex_t *mx; /* protection for class_busy */
1262 1262 timestruc_t timeout; /* surveillance for remote daemon */
1263 1263 md_mn_nodeid_t nid;
1264 1264 md_mn_msgtype_t msgtype;
1265 1265 md_mn_msgclass_t class;
1266 1266
1267 1267 nid = node->nd_nodeid;
1268 1268 msgtype = msg->msg_type;
1269 1269 setno = msg->msg_setno;
1270 1270 class = mdmn_get_message_class(msgtype);
1271 1271 mx = mdmn_get_master_table_mx(setno, class);
1272 1272 cv = mdmn_get_master_table_cv(setno, class);
1273 1273
1274 1274 retry_rpc:
1275 1275
1276 1276 /* We try two times to send the message */
1277 1277 rpc_retries = 2;
1278 1278
1279 1279 /*
1280 1280 * if sending the message doesn't succeed the first time due to a
1281 1281 * RPC problem, we retry one time
1282 1282 */
1283 1283 while ((rpc_retries != 0) && (ret == NULL)) {
1284 1284 /* in abort state, we error out immediately */
1285 1285 if (md_commd_global_state & MD_CGS_ABORTED) {
1286 1286 return (MDMNE_ABORT);
1287 1287 }
1288 1288
1289 1289 (void) rw_rdlock(&client_rwlock[setno]);
1290 1290 /* unable to create client? Ignore it */
1291 1291 if (check_client(setno, nid)) {
1292 1292 /*
1293 1293 * In case we cannot establish an RPC client, we
1294 1294 * take this node out of our considerations.
1295 1295 * This will be reset by a reconfig
1296 1296 * cycle that should come pretty soon.
1297 1297 * MNISSUE: Should a reconfig cycle
1298 1298 * be forced on SunCluster?
1299 1299 */
1300 1300 node->nd_flags &= ~MD_MN_NODE_OWN;
1301 1301 commd_debug(MD_MMV_SYSLOG,
1302 1302 "WARNING couldn't create client for %s\n"
1303 1303 "Reconfig cycle required\n",
1304 1304 node->nd_nodename);
1305 1305 commd_debug(MD_MMV_PROC_M, "proc_mas: (%d,0x%llx-%d) "
1306 1306 "WARNING couldn't create client for %s\n",
1307 1307 MSGID_ELEMS(msg->msg_msgid), node->nd_nodename);
1308 1308 (void) rw_unlock(&client_rwlock[setno]);
1309 1309 return (MDMNE_IGNORE_NODE);
1310 1310 }
1311 1311 /* let's be paranoid and check again before sending */
1312 1312 if (client[setno][nid] == NULL) {
1313 1313 /*
1314 1314 * if this is true, strange enough, we catch our breath,
1315 1315 * and then continue, so that the client is set up
1316 1316 * once again.
1317 1317 */
1318 1318 commd_debug(MD_MMV_PROC_M, "client is NULL\n");
1319 1319 (void) rw_unlock(&client_rwlock[setno]);
1320 1320 (void) sleep(1);
1321 1321 continue;
1322 1322 }
1323 1323
1324 1324 /* send it over, it will return immediately */
1325 1325 ret = mdmn_work_2(msg, client[setno][nid], nid);
1326 1326
1327 1327 (void) rw_unlock(&client_rwlock[setno]);
1328 1328
1329 1329 if (ret != NULL) {
1330 1330 commd_debug(MD_MMV_PROC_M,
1331 1331 "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1332 1332 " 0x%x\n",
1333 1333 MSGID_ELEMS(msg->msg_msgid), nid, *ret);
1334 1334 } else {
1335 1335 commd_debug(MD_MMV_PROC_M,
1336 1336 "proc_mas: sending (%d,0x%llx-%d) to %d returned "
1337 1337 " NULL \n",
1338 1338 MSGID_ELEMS(msg->msg_msgid), nid);
1339 1339 }
1340 1340
1341 1341 if ((ret == NULL) || (*ret == MDMNE_CANNOT_CONNECT) ||
1342 1342 (*ret == MDMNE_THR_CREATE_FAIL)) {
1343 1343 /*
1344 1344 * Something happened to the daemon on the other side.
1345 1345 * Kill the client, and try again.
1346 1346 * check_client() will create a new client
1347 1347 */
1348 1348 (void) rw_wrlock(&client_rwlock[setno]);
1349 1349 mdmn_clnt_destroy(client[setno][nid]);
1350 1350 if (client[setno][nid] != (CLIENT *)NULL) {
1351 1351 client[setno][nid] = (CLIENT *)NULL;
1352 1352 }
1353 1353 (void) rw_unlock(&client_rwlock[setno]);
1354 1354
1355 1355 /* ... but don't try infinitely */
1356 1356 --rpc_retries;
1357 1357 continue;
1358 1358 }
1359 1359 /*
1360 1360 * If the class is locked on the other node, keep trying.
1361 1361 * This situation will go away automatically,
1362 1362 * if we wait long enough
1363 1363 */
1364 1364 if (*ret == MDMNE_CLASS_LOCKED) {
1365 1365 (void) sleep(1);
1366 1366 free(ret);
1367 1367 ret = NULL;
1368 1368 continue;
1369 1369 }
1370 1370 }
1371 1371 if (ret == NULL) {
1372 1372 return (MDMNE_RPC_FAIL);
1373 1373 }
1374 1374
1375 1375
1376 1376 /* if the slave is in abort state, we just ignore it. */
1377 1377 if (*ret == MDMNE_ABORT) {
1378 1378 commd_debug(MD_MMV_PROC_M,
1379 1379 "proc_mas: work(%d,0x%llx-%d) returned "
1380 1380 "MDMNE_ABORT\n",
1381 1381 MSGID_ELEMS(msg->msg_msgid));
1382 1382 free(ret);
1383 1383 return (MDMNE_IGNORE_NODE);
1384 1384 }
1385 1385
1386 1386 /* Did the remote processing succeed? */
1387 1387 if (*ret != MDMNE_ACK) {
1388 1388 /*
1389 1389 * Some commd failure in the middle of sending the msg
1390 1390 * to the nodes. We don't continue here.
1391 1391 */
1392 1392 commd_debug(MD_MMV_PROC_M,
1393 1393 "proc_mas: work(%d,0x%llx-%d) returns %d\n",
1394 1394 MSGID_ELEMS(msg->msg_msgid), *ret);
1395 1395 free(ret);
1396 1396 return (MDMNE_RPC_FAIL);
1397 1397 }
1398 1398 free(ret);
1399 1399 ret = NULL;
1400 1400
1401 1401 /*
1402 1402 * When we are here, we have sent the message to the other node and
1403 1403 * we know that node has accepted it.
1404 1404 * We go to sleep and have trust to be woken up by wakeup.
1405 1405 * If we wakeup due to a timeout, or a signal, no result has been
1406 1406 * placed in the appropriate slot.
1407 1407 * If we timeout, it is likely that this is because the node has
1408 1408 * gone away, so we will destroy the client and try it again in the
1409 1409 * expectation that the rpc will fail and we will return
1410 1410 * MDMNE_IGNORE_NODE. If that is not the case, the message must still
1411 1411 * be being processed on the slave. In this case just timeout for 4
1412 1412 * more seconds and then return RPC_FAIL if the message is not complete.
1413 1413 */
1414 1414 timeout.tv_nsec = 0;
1415 1415 timeout.tv_sec = (timeout_retries == 0) ? mdmn_get_timeout(msgtype) :
1416 1416 FOUR_SECS.tv_sec;
1417 1417 err = cond_reltimedwait(cv, mx, &timeout);
1418 1418
1419 1419 if (err == 0) {
1420 1420 /* everything's fine, return success */
1421 1421 return (MDMNE_ACK);
1422 1422 }
1423 1423
1424 1424 if (err == ETIME) {
1425 1425 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1426 1426 "timeout occured, set=%d, class=%d, "
1427 1427 "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n",
1428 1428 setno, class, MSGID_ELEMS(msg->msg_msgid), timeout_retries);
1429 1429 if (timeout_retries == 0) {
1430 1430 timeout_retries++;
1431 1431 /*
1432 1432 * Destroy the client and try the rpc call again
1433 1433 */
1434 1434 (void) rw_wrlock(&client_rwlock[setno]);
1435 1435 mdmn_clnt_destroy(client[setno][nid]);
1436 1436 client[setno][nid] = (CLIENT *)NULL;
1437 1437 (void) rw_unlock(&client_rwlock[setno]);
1438 1438 goto retry_rpc;
1439 1439 }
1440 1440 } else if (err == EINTR) {
1441 1441 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1442 1442 "commd signalled, set=%d, class=%d, "
1443 1443 "msgid=(%d, 0x%llx-%d)\n",
1444 1444 setno, class, MSGID_ELEMS(msg->msg_msgid));
1445 1445 } else {
1446 1446 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1447 1447 "cond_reltimedwait err=%d, set=%d, "
1448 1448 "class=%d, msgid=(%d, 0x%llx-%d)\n",
1449 1449 err, setno, class,
1450 1450 MSGID_ELEMS(msg->msg_msgid));
1451 1451 }
1452 1452
1453 1453 /* some failure happened */
1454 1454 return (MDMNE_RPC_FAIL);
1455 1455 }
1456 1456
1457 1457 /*
1458 1458 * before we return we have to
1459 1459 * free_msg(msg); because we are working on a copied message
1460 1460 */
1461 1461 void
1462 1462 mdmn_master_process_msg(md_mn_msg_t *msg)
1463 1463 {
1464 1464 int *ret;
1465 1465 int err;
1466 1466 int nmsgs; /* total number of msgs */
1467 1467 int curmsg; /* index of current msg */
1468 1468 set_t setno;
1469 1469 uint_t inherit_flags = 0;
1470 1470 uint_t secdiff, usecdiff; /* runtime of this message */
1471 1471 md_error_t mde = mdnullerror;
1472 1472 md_mn_msg_t *msglist[MAX_SUBMESSAGES]; /* all msgs to process */
1473 1473 md_mn_msg_t *cmsg; /* current msg */
1474 1474 md_mn_msgid_t dummyid;
1475 1475 md_mn_result_t *result;
1476 1476 md_mn_result_t *slave_result;
1477 1477 md_mn_nodeid_t sender;
1478 1478 md_mn_nodeid_t set_master;
1479 1479 md_mnnode_desc *node;
1480 1480 md_mn_msgtype_t orig_type; /* type of the original message */
1481 1481 md_mn_msgtype_t msgtype; /* type of the current message */
1482 1482 md_mn_msgclass_t orig_class; /* class of the original message */
1483 1483 md_mn_msgclass_t class; /* class of the current message */
1484 1484
1485 1485 int (*smgen)(md_mn_msg_t *msg, md_mn_msg_t **msglist);
1486 1486
1487 1487 orig_type = msgtype = msg->msg_type;
1488 1488 sender = msg->msg_sender;
1489 1489 setno = msg->msg_setno;
1490 1490
1491 1491 result = Zalloc(sizeof (md_mn_result_t));
1492 1492 result->mmr_setno = setno;
1493 1493 result->mmr_msgtype = msgtype;
1494 1494 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
1495 1495
1496 1496 orig_class = mdmn_get_message_class(msgtype);
1497 1497
1498 1498 commd_debug(MD_MMV_PROC_M,
1499 1499 "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
1500 1500 MSGID_ELEMS(msg->msg_msgid), setno, orig_class, msgtype);
1501 1501
1502 1502 (void) rw_rdlock(&set_desc_rwlock[setno]);
1503 1503 set_master = set_descriptor[setno]->sd_mn_master_nodeid;
1504 1504 result->mmr_sender = set_master;
1505 1505 /*
1506 1506 * Put message into the change log unless told otherwise
1507 1507 * Note that we only log original messages.
1508 1508 * If they are generated by some smgen, we don't log them!
1509 1509 * Replay messages aren't logged either.
1510 1510 * Note, that replay messages are unlogged on completion.
1511 1511 */
1512 1512 if ((msg->msg_flags & (MD_MSGF_NO_LOG | MD_MSGF_REPLAY_MSG)) == 0) {
1513 1513 commd_debug(MD_MMV_PROC_M,
1514 1514 "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n",
1515 1515 MSGID_ELEMS(msg->msg_msgid), msgtype);
1516 1516 err = mdmn_log_msg(msg);
1517 1517 if (err == MDMNE_NULL) {
1518 1518 /* msg logged successfully */
1519 1519 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1520 1520 "done log_msg for (%d,0x%llx-%d) type %d\n",
1521 1521 MSGID_ELEMS(msg->msg_msgid), msgtype);
1522 1522 goto proceed;
1523 1523 }
1524 1524 if (err == MDMNE_ACK) {
1525 1525 /* Same msg in the slot, proceed */
1526 1526 commd_debug(MD_MMV_PROC_M, "proc_mas: "
1527 1527 "already logged (%d,0x%llx-%d) type %d\n",
1528 1528 MSGID_ELEMS(msg->msg_msgid), msgtype);
1529 1529 goto proceed;
1530 1530 }
1531 1531 if (err == MDMNE_LOG_FAIL) {
1532 1532 /* Oh, bad, the log is non functional. */
1533 1533 result->mmr_comm_state = MDMNE_LOG_FAIL;
1534 1534 /*
1535 1535 * Note that the mark_busy was already done by
1536 1536 * mdmn_work_svc_2()
1537 1537 */
1538 1538 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1539 1539 mdmn_mark_class_unbusy(setno, orig_class);
1540 1540 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1541 1541
1542 1542 }
1543 1543 if (err == MDMNE_CLASS_BUSY) {
1544 1544 /*
1545 1545 * The log is occupied with a different message
1546 1546 * that needs to be played first.
1547 1547 * We reject the current message with MDMNE_CLASS_BUSY
1548 1548 * to the initiator and do not unbusy the set/class,
1549 1549 * because we will proceed with the logged message,
1550 1550 * which has the same set/class combination
1551 1551 */
1552 1552 result->mmr_comm_state = MDMNE_CLASS_BUSY;
1553 1553 }
1554 1554 ret = (int *)NULL;
1555 1555 (void) rw_rdlock(&client_rwlock[setno]);
1556 1556
1557 1557 if (check_client(setno, sender)) {
1558 1558 commd_debug(MD_MMV_SYSLOG,
1559 1559 "proc_mas: No client for initiator \n");
1560 1560 } else {
1561 1561 ret = mdmn_wakeup_initiator_2(result,
1562 1562 client[setno][sender], sender);
1563 1563 }
1564 1564 (void) rw_unlock(&client_rwlock[setno]);
1565 1565
1566 1566 if (ret == (int *)NULL) {
1567 1567 commd_debug(MD_MMV_SYSLOG,
1568 1568 "proc_mas: couldn't wakeup_initiator \n");
1569 1569 } else {
1570 1570 if (*ret != MDMNE_ACK) {
1571 1571 commd_debug(MD_MMV_SYSLOG, "proc_mas: "
1572 1572 "wakeup_initiator returned %d\n", *ret);
1573 1573 }
1574 1574 free(ret);
1575 1575 }
1576 1576 free_msg(msg);
1577 1577
1578 1578 if (err == MDMNE_LOG_FAIL) {
1579 1579 /* we can't proceed here */
1580 1580 free_result(result);
1581 1581 (void) rw_unlock(&set_desc_rwlock[setno]);
1582 1582 return;
1583 1583 } else if (err == MDMNE_CLASS_BUSY) {
1584 1584 mdmn_changelog_record_t *lr;
1585 1585 lr = mdmn_get_changelogrec(setno, orig_class);
1586 1586 assert(lr != NULL);
1587 1587
1588 1588 /* proceed with the logged message */
1589 1589 msg = copy_msg(&(lr->lr_msg), NULL);
1590 1590
1591 1591 /*
1592 1592 * The logged message has to have the same class but
1593 1593 * type and sender can be different
1594 1594 */
1595 1595 orig_type = msgtype = msg->msg_type;
1596 1596 sender = msg->msg_sender;
1597 1597
1598 1598 commd_debug(MD_MMV_PROC_M,
1599 1599 "proc_mas: Got new message from change log: "
1600 1600 "(%d,0x%llx-%d) type %d\n",
1601 1601 MSGID_ELEMS(msg->msg_msgid), msgtype);
1602 1602
1603 1603 /* continue normal operation with this message */
1604 1604 }
1605 1605 }
1606 1606
1607 1607 proceed:
1608 1608 smgen = mdmn_get_submessage_generator(msgtype);
1609 1609 if (smgen == NULL) {
1610 1610 /* no submessages to create, just use the original message */
1611 1611 msglist[0] = msg;
1612 1612 nmsgs = 1;
1613 1613 } else {
1614 1614 /* some bits are passed on to submessages */
1615 1615 inherit_flags = msg->msg_flags & MD_MSGF_INHERIT_BITS;
1616 1616
1617 1617 nmsgs = smgen(msg, msglist);
1618 1618
1619 1619 /* some settings for the submessages */
1620 1620 for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1621 1621 cmsg = msglist[curmsg];
1622 1622
1623 1623 /* Apply the inherited flags */
1624 1624 cmsg->msg_flags |= inherit_flags;
1625 1625
1626 1626 /*
1627 1627 * Make sure the submessage ID is set correctly
1628 1628 * Note: first submessage has mid_smid of 1 (not 0)
1629 1629 */
1630 1630 cmsg->msg_msgid.mid_smid = curmsg + 1;
1631 1631
1632 1632 /* need the original class set in msgID (for MCT) */
1633 1633 cmsg->msg_msgid.mid_oclass = orig_class;
1634 1634 }
1635 1635
1636 1636 commd_debug(MD_MMV_PROC_M,
1637 1637 "smgen generated %d submsgs, origclass = %d\n",
1638 1638 nmsgs, orig_class);
1639 1639 }
1640 1640 /*
1641 1641 * This big loop does the following.
1642 1642 * For all messages:
1643 1643 * process message on the master first (a message completion
1644 1644 * table MCT ensures a message is not processed twice)
1645 1645 * in case of an error break out of message loop
1646 1646 * for all nodes -- unless MD_MSGF_NO_BCAST is set --
1647 1647 * send message to node until that succeeds
1648 1648 * merge result -- not yet implemented
1649 1649 * respect MD_MSGF_STOP_ON_ERROR
1650 1650 */
1651 1651 for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1652 1652 int break_msg_loop = 0;
1653 1653 mutex_t *mx; /* protection for class_busy */
1654 1654 int master_err;
1655 1655 int master_exitval = -1;
1656 1656
1657 1657 cmsg = msglist[curmsg];
1658 1658 msgtype = cmsg->msg_type;
1659 1659 class = mdmn_get_message_class(msgtype);
1660 1660 node = NULL;
1661 1661 mx = mdmn_get_master_table_mx(setno, class);
1662 1662
1663 1663 /* If we are in the abort state, we error out immediately */
1664 1664 if (md_commd_global_state & MD_CGS_ABORTED) {
1665 1665 break; /* out of the message loop */
1666 1666 }
1667 1667
1668 1668 commd_debug(MD_MMV_PROC_M, "class=%d, orig_class=%d\n",
1669 1669 class, orig_class);
1670 1670 /*
1671 1671 * If the current class is different from the original class,
1672 1672 * we have to lock it down.
1673 1673 * The original class is already marked busy.
1674 1674 * At this point we cannot refuse the message because the
1675 1675 * class is busy right now, so we wait until the class becomes
1676 1676 * available again. As soon as something changes for this set
1677 1677 * we will be cond_signal'ed (in mdmn_mark_class_unbusy)
1678 1678 *
1679 1679 * Granularity could be finer (setno/class)
1680 1680 */
1681 1681 if (class != orig_class) {
1682 1682 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1683 1683 while (mdmn_mark_class_busy(setno, class) == FALSE) {
1684 1684 (void) cond_wait(&mdmn_busy_cv[setno],
1685 1685 &mdmn_busy_mutex[setno]);
1686 1686 }
1687 1687 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1688 1688 }
1689 1689
1690 1690 master_err = do_message_locally(cmsg, result);
1691 1691
1692 1692 if ((master_err != MDMNE_ACK) ||
1693 1693 ((master_err == MDMNE_ACK) && (result->mmr_exitval != 0))) {
1694 1694 result->mmr_failing_node = set_master;
1695 1695 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1696 1696 /*
1697 1697 * if appropriate, unbusy the class and
1698 1698 * break out of the message loop
1699 1699 */
1700 1700 if (class != orig_class) {
1701 1701 (void) mutex_lock(
1702 1702 &mdmn_busy_mutex[setno]);
1703 1703 mdmn_mark_class_unbusy(setno, class);
1704 1704 (void) mutex_unlock(
1705 1705 &mdmn_busy_mutex[setno]);
1706 1706 }
1707 1707 break;
1708 1708 }
1709 1709 }
1710 1710
1711 1711 if (master_err == MDMNE_ACK)
1712 1712 master_exitval = result->mmr_exitval;
1713 1713
1714 1714 /* No broadcast? => next message */
1715 1715 if (cmsg->msg_flags & MD_MSGF_NO_BCAST) {
1716 1716 /* if appropriate, unbusy the class */
1717 1717 if (class != orig_class) {
1718 1718 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1719 1719 mdmn_mark_class_unbusy(setno, class);
1720 1720 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1721 1721 }
1722 1722 continue;
1723 1723 }
1724 1724
1725 1725
1726 1726 /* fake sender, so we get notified when the results are avail */
1727 1727 cmsg->msg_sender = set_master;
1728 1728 /*
1729 1729 * register to the master_table. It's needed by wakeup_master to
1730 1730 * wakeup the sleeping thread.
1731 1731 * Access is protected by the class lock: mdmn_mark_class_busy()
1732 1732 */
1733 1733 mdmn_set_master_table_id(setno, class, &(cmsg->msg_msgid));
1734 1734
1735 1735
1736 1736
1737 1737 (void) rw_rdlock(&set_desc_rwlock[setno]);
1738 1738 /* Send the message to all other nodes */
1739 1739 for (node = set_descriptor[setno]->sd_nodelist; node;
1740 1740 node = node->nd_next) {
1741 1741 md_mn_nodeid_t nid = node->nd_nodeid;
1742 1742
1743 1743 /* We are master and have already processed the msg */
1744 1744 if (node == set_descriptor[setno]->sd_mn_masternode) {
1745 1745 continue;
1746 1746 }
1747 1747
1748 1748 /* If this node didn't join the disk set, ignore it */
1749 1749 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) {
1750 1750 continue;
1751 1751 }
1752 1752
1753 1753 /* If a DIRECTED message, skip non-recipient nodes */
1754 1754 if ((cmsg->msg_flags & MD_MSGF_DIRECTED) &&
1755 1755 nid != cmsg->msg_recipient) {
1756 1756 continue;
1757 1757 }
1758 1758
1759 1759 (void) mutex_lock(mx);
1760 1760 /*
1761 1761 * Register the node that is addressed,
1762 1762 * so we can detect unsolicited messages
1763 1763 */
1764 1764 mdmn_set_master_table_addr(setno, class, nid);
1765 1765 slave_result = (md_mn_result_t *)NULL;
1766 1766
1767 1767 /*
1768 1768 * Now send it. do_send_message() will return if
1769 1769 * a failure occurs or
1770 1770 * the results are available
1771 1771 */
1772 1772 err = do_send_message(cmsg, node);
1773 1773
1774 1774 /* in abort state, we error out immediately */
1775 1775 if (md_commd_global_state & MD_CGS_ABORTED) {
1776 1776 break;
1777 1777 }
1778 1778
1779 1779 if (err == MDMNE_ACK) {
1780 1780 slave_result =
1781 1781 mdmn_get_master_table_res(setno, class);
1782 1782 commd_debug(MD_MMV_PROC_M,
1783 1783 "proc_mas: got result for (%d,0x%llx-%d)\n",
1784 1784 MSGID_ELEMS(cmsg->msg_msgid));
1785 1785 } else if (err == MDMNE_IGNORE_NODE) {
1786 1786 (void) mutex_unlock(mx);
1787 1787 continue; /* send to next node */
1788 1788 }
1789 1789 (void) mutex_unlock(mx);
1790 1790
1791 1791
1792 1792 /*
1793 1793 * If the result is NULL, or err doesn't show success,
1794 1794 * something went wrong with this RPC call.
1795 1795 */
1796 1796 if ((slave_result == NULL) || (err != MDMNE_ACK)) {
1797 1797 /*
1798 1798 * If PANIC_WHEN_INCONSISTENT set,
1799 1799 * panic if the master succeeded while
1800 1800 * this node failed
1801 1801 */
1802 1802 if ((cmsg->msg_flags &
1803 1803 MD_MSGF_PANIC_WHEN_INCONSISTENT) &&
1804 1804 (master_err == MDMNE_ACK))
1805 1805 panic_system(nid, cmsg->msg_type,
1806 1806 master_err, master_exitval,
1807 1807 slave_result);
1808 1808
1809 1809 result->mmr_failing_node = nid;
1810 1810 /* are we supposed to stop in case of error? */
1811 1811 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1812 1812 result->mmr_exitval = MDMNE_RPC_FAIL;
1813 1813 commd_debug(MD_MMV_SYSLOG, "proc_mas: "
1814 1814 "result (%d,0x%llx-%d) is NULL\n",
1815 1815 MSGID_ELEMS(cmsg->msg_msgid));
1816 1816 FLUSH_DEBUGFILE();
1817 1817 break_msg_loop = 1;
1818 1818 break; /* out of node loop first */
1819 1819 } else {
1820 1820 /* send msg to the next node */
1821 1821 continue;
1822 1822 }
1823 1823
1824 1824 }
1825 1825
1826 1826 /*
1827 1827 * Message processed on remote node.
1828 1828 * If PANIC_WHEN_INCONSISTENT set, panic if the
1829 1829 * result is different on this node from the result
1830 1830 * on the master
1831 1831 */
1832 1832 if ((cmsg->msg_flags &
1833 1833 MD_MSGF_PANIC_WHEN_INCONSISTENT) &&
1834 1834 ((master_err != MDMNE_ACK) ||
1835 1835 (slave_result->mmr_exitval != master_exitval)))
1836 1836 panic_system(nid, cmsg->msg_type, master_err,
1837 1837 master_exitval, slave_result);
1838 1838
1839 1839 /*
1840 1840 * At this point we know we have a message that was
1841 1841 * processed on the remote node.
1842 1842 * We now check if the exitval is non zero.
1843 1843 * In that case we discard the previous result and
1844 1844 * rather use the current.
1845 1845 * This means: If a message fails on no node,
1846 1846 * the result from the master will be returned.
1847 1847 * There's currently no such thing as merge of results
1848 1848 * If additionally STOP_ON_ERROR is set, we bail out
1849 1849 */
1850 1850 if (slave_result->mmr_exitval != 0) {
1851 1851 /* throw away the previously allocated result */
1852 1852 free_result(result);
1853 1853
1854 1854 /* copy_result() allocates new memory */
1855 1855 result = copy_result(slave_result);
1856 1856 free_result(slave_result);
1857 1857
1858 1858 dump_result(MD_MMV_PROC_M, "proc_mas", result);
1859 1859
1860 1860 result->mmr_failing_node = nid;
1861 1861 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) {
1862 1862 break_msg_loop = 1;
1863 1863 break; /* out of node loop */
1864 1864 }
1865 1865 continue; /* try next node */
1866 1866
1867 1867 } else {
1868 1868 /*
1869 1869 * MNIssue: may want to merge the results
1870 1870 * from all slaves. Currently only report
1871 1871 * the results from the master.
1872 1872 */
1873 1873 free_result(slave_result);
1874 1874 }
1875 1875
1876 1876 } /* End of loop over the nodes */
1877 1877 (void) rw_unlock(&set_desc_rwlock[setno]);
1878 1878
1879 1879
1880 1880 /* release the current class again */
1881 1881 if (class != orig_class) {
1882 1882 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1883 1883 mdmn_mark_class_unbusy(setno, class);
1884 1884 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1885 1885 }
1886 1886
1887 1887 /* are we supposed to quit entirely ? */
1888 1888 if (break_msg_loop ||
1889 1889 (md_commd_global_state & MD_CGS_ABORTED)) {
1890 1890 break; /* out of msg loop */
1891 1891 }
1892 1892
1893 1893 } /* End of loop over the messages */
1894 1894 /*
1895 1895 * If we are here, there's two possibilities:
1896 1896 * - we processed all messages on all nodes without an error.
1897 1897 * In this case we return the result from the master.
1898 1898 * (to be implemented: return the merged result)
1899 1899 * - we encountered an error in which case result has been
1900 1900 * set accordingly already.
1901 1901 */
1902 1902
1903 1903 if (md_commd_global_state & MD_CGS_ABORTED) {
1904 1904 result->mmr_comm_state = MDMNE_ABORT;
1905 1905 }
1906 1906
1907 1907 /*
1908 1908 * This message has been processed completely.
1909 1909 * Remove it from the changelog.
1910 1910 * Do this for replay messages too.
1911 1911 * Note that the message is unlogged before waking up the
1912 1912 * initiator. This is done for two reasons.
1913 1913 * 1. Remove a race condition that occurs when back to back
1914 1914 * messages are sent for the same class and the registration
1915 1915 * is lost.
1916 1916 * 2. If the initiator died but the action was completed on all
1917 1917 * the nodes, we want that to be marked "done" quickly.
1918 1918 */
1919 1919
1920 1920 if ((msg->msg_flags & MD_MSGF_NO_LOG) == 0) {
1921 1921 commd_debug(MD_MMV_PROC_M,
1922 1922 "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n",
1923 1923 MSGID_ELEMS(msg->msg_msgid), msgtype);
1924 1924 (void) mdmn_unlog_msg(msg);
1925 1925 commd_debug(MD_MMV_PROC_M,
1926 1926 "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n",
1927 1927 MSGID_ELEMS(msg->msg_msgid), msgtype);
1928 1928 }
1929 1929
1930 1930 /*
1931 1931 * In case of submessages, we increased the submessage ID in the
1932 1932 * result structure. We restore the message ID to the value that
1933 1933 * the initiator is waiting for.
1934 1934 */
1935 1935 result->mmr_msgid.mid_smid = 0;
1936 1936 result->mmr_msgtype = orig_type;
1937 1937 result->mmr_sender = set_master;
1938 1938
1939 1939 /* if we have an inited client, send result */
1940 1940 ret = (int *)NULL;
1941 1941
1942 1942 (void) rw_rdlock(&client_rwlock[setno]);
1943 1943 if (check_client(setno, sender)) {
1944 1944 commd_debug(MD_MMV_SYSLOG,
1945 1945 "proc_mas: unable to create client for initiator\n");
1946 1946 } else {
1947 1947 ret = mdmn_wakeup_initiator_2(result, client[setno][sender],
1948 1948 sender);
1949 1949 }
1950 1950 (void) rw_unlock(&client_rwlock[setno]);
1951 1951
1952 1952 if (ret == (int *)NULL) {
1953 1953 commd_debug(MD_MMV_PROC_M,
1954 1954 "proc_mas: couldn't wakeup initiator\n");
1955 1955 } else {
1956 1956 if (*ret != MDMNE_ACK) {
1957 1957 commd_debug(MD_MMV_PROC_M,
1958 1958 "proc_mas: wakeup_initiator returned %d\n",
1959 1959 *ret);
1960 1960 }
1961 1961 free(ret);
1962 1962 }
1963 1963
1964 1964 (void) rw_unlock(&set_desc_rwlock[setno]);
1965 1965 /* Free all submessages, if there were any */
1966 1966 if (nmsgs > 1) {
1967 1967 for (curmsg = 0; curmsg < nmsgs; curmsg++) {
1968 1968 free_msg(msglist[curmsg]);
1969 1969 }
1970 1970 }
1971 1971 /* Free the result */
1972 1972 free_result(result);
1973 1973
1974 1974 (void) mutex_lock(&mdmn_busy_mutex[setno]);
1975 1975 mdmn_mark_class_unbusy(setno, orig_class);
1976 1976 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
1977 1977
1978 1978
1979 1979 /*
1980 1980 * We use this ioctl just to get the time in the same format as used in
1981 1981 * the messageID. If it fails, all we get is a bad runtime output.
1982 1982 */
1983 1983 (void) metaioctl(MD_IOCGUNIQMSGID, &dummyid, &mde, NULL);
1984 1984 secdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) >> 32;
1985 1985 usecdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) & 0xfffff;
1986 1986
1987 1987 /* catching possible overflow */
1988 1988 if (usecdiff >= 1000000) {
1989 1989 usecdiff -= 1000000;
1990 1990 secdiff++;
1991 1991 }
1992 1992
1993 1993
1994 1994 commd_debug(MD_MMV_PROC_M, "proc_mas: done (%d, 0x%llx-%d) type=%02d "
1995 1995 "%5d.%06d secs runtime\n",
1996 1996 MSGID_ELEMS(msg->msg_msgid), orig_type, secdiff, usecdiff);
1997 1997
1998 1998 /* Free the original message */
1999 1999 free_msg(msg);
2000 2000 }
2001 2001
2002 2002 void
2003 2003 mdmn_slave_process_msg(md_mn_msg_t *msg)
2004 2004 {
2005 2005 int *ret = NULL;
2006 2006 int completed;
2007 2007 int retries;
2008 2008 int successfully_returned;
2009 2009 set_t setno;
2010 2010 md_mn_result_t *result;
2011 2011 md_mn_nodeid_t sender;
2012 2012 md_mn_nodeid_t whoami;
2013 2013 md_mn_msgtype_t msgtype;
2014 2014 md_mn_msgclass_t class;
2015 2015
2016 2016 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);
2017 2017
2018 2018 setno = msg->msg_setno;
2019 2019 sender = msg->msg_sender; /* this is always the master of the set */
2020 2020 msgtype = msg->msg_type;
2021 2021
2022 2022 (void) rw_rdlock(&set_desc_rwlock[setno]);
2023 2023 whoami = set_descriptor[setno]->sd_mn_mynode->nd_nodeid;
2024 2024 (void) rw_unlock(&set_desc_rwlock[setno]);
2025 2025
2026 2026 result = Zalloc(sizeof (md_mn_result_t));
2027 2027 result->mmr_flags = msg->msg_flags;
2028 2028 result->mmr_setno = setno;
2029 2029 result->mmr_msgtype = msgtype;
2030 2030 result->mmr_sender = whoami;
2031 2031 result->mmr_comm_state = MDMNE_ACK; /* Ok state */
2032 2032 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));
2033 2033 class = mdmn_get_message_class(msgtype);
2034 2034
2035 2035 commd_debug(MD_MMV_PROC_S,
2036 2036 "proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2037 2037 MSGID_ELEMS(msg->msg_msgid), setno, class, msgtype);
2038 2038
2039 2039 handler = mdmn_get_handler(msgtype);
2040 2040
2041 2041 if (handler == NULL) {
2042 2042 result->mmr_exitval = 0;
2043 2043 /* let the sender decide if this is an error or not */
2044 2044 result->mmr_comm_state = MDMNE_NO_HANDLER;
2045 2045 commd_debug(MD_MMV_PROC_S,
2046 2046 "proc_sla: No handler for (%d, 0x%llx-%d)\n",
2047 2047 MSGID_ELEMS(msg->msg_msgid));
2048 2048 } else {
2049 2049
2050 2050 /* Did we already process this message ? */
2051 2051 (void) mutex_lock(&mct_mutex[setno][class]);
2052 2052 completed = mdmn_check_completion(msg, result);
2053 2053
2054 2054 if (completed == MDMN_MCT_NOT_DONE) {
2055 2055 /* message not yet processed locally */
2056 2056 commd_debug(MD_MMV_PROC_S,
2057 2057 "proc_sla: calling handler for (%d, 0x%llx-%d)\n",
2058 2058 MSGID_ELEMS(msg->msg_msgid));
2059 2059
2060 2060 /*
2061 2061 * Mark the message as being currently processed,
2062 2062 * so we won't start a second handler for it
2063 2063 */
2064 2064 (void) mdmn_mark_completion(msg, NULL,
2065 2065 MDMN_MCT_IN_PROGRESS);
2066 2066
2067 2067 (void) mutex_unlock(&mct_mutex[setno][class]);
2068 2068 (*handler)(msg, MD_MSGF_ON_SLAVE, result);
2069 2069
2070 2070 commd_debug(MD_MMV_PROC_S,
2071 2071 "proc_sla: finished handler for (%d, 0x%llx-%d)\n",
2072 2072 MSGID_ELEMS(msg->msg_msgid));
2073 2073
2074 2074 (void) mutex_lock(&mct_mutex[setno][class]);
2075 2075 /* Mark the message as fully done, store the result */
2076 2076 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
2077 2077
2078 2078 } else if (completed == MDMN_MCT_DONE) {
2079 2079 /* message processed previously, got result from MCT */
2080 2080 commd_debug(MD_MMV_PROC_S,
2081 2081 "proc_sla: result for (%d, 0x%llx-%d) from MCT\n",
2082 2082 MSGID_ELEMS(msg->msg_msgid));
2083 2083 } else if (completed == MDMN_MCT_IN_PROGRESS) {
2084 2084 /*
2085 2085 * If the message is curruntly being processed,
2086 2086 * we can return here, without sending a result back.
2087 2087 * This will be done by the initial message handling
2088 2088 * thread
2089 2089 */
2090 2090 (void) mutex_unlock(&mct_mutex[setno][class]);
2091 2091 commd_debug(MD_MMV_PROC_M, "proc_sla: "
2092 2092 "(%d, 0x%llx-%d) is currently being processed\n",
2093 2093 MSGID_ELEMS(msg->msg_msgid), msgtype);
2094 2094
2095 2095 free_msg(msg);
2096 2096 free_result(result);
2097 2097 return;
2098 2098 } else {
2099 2099 /* MCT error occurred (should never happen) */
2100 2100 result->mmr_comm_state = MDMNE_LOG_FAIL;
2101 2101 commd_debug(MD_MMV_PROC_S,
2102 2102 "proc_sla: MCT error for (%d, 0x%llx-%d)\n",
2103 2103 MSGID_ELEMS(msg->msg_msgid));
2104 2104 }
2105 2105 (void) mutex_unlock(&mct_mutex[setno][class]);
2106 2106 }
2107 2107
2108 2108 /*
2109 2109 * At this point we have a result (even in an error case)
2110 2110 * that we return to the master.
2111 2111 */
2112 2112 (void) rw_rdlock(&set_desc_rwlock[setno]);
2113 2113 retries = 2; /* we will try two times to send the results */
2114 2114 successfully_returned = 0;
2115 2115
2116 2116 while (!successfully_returned && (retries != 0)) {
2117 2117 ret = (int *)NULL;
2118 2118 (void) rw_rdlock(&client_rwlock[setno]);
2119 2119 if (check_client(setno, sender)) {
2120 2120 /*
2121 2121 * If we cannot setup the rpc connection to the master,
2122 2122 * we can't do anything besides logging this fact.
2123 2123 */
2124 2124 commd_debug(MD_MMV_SYSLOG,
2125 2125 "proc_mas: unable to create client for master\n");
2126 2126 (void) rw_unlock(&client_rwlock[setno]);
2127 2127 break;
2128 2128 } else {
2129 2129 ret = mdmn_wakeup_master_2(result,
2130 2130 client[setno][sender], sender);
2131 2131 /*
2132 2132 * if mdmn_wakeup_master_2 returns NULL, it can be that
2133 2133 * the master (or the commd on the master) had died.
2134 2134 * In that case, we destroy the client to the master
2135 2135 * and retry.
2136 2136 * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK,
2137 2137 * the commd on the master is alive but
2138 2138 * something else is wrong,
2139 2139 * in that case a retry doesn't make sense => break out
2140 2140 */
2141 2141 if (ret == (int *)NULL) {
2142 2142 commd_debug(MD_MMV_PROC_S,
2143 2143 "proc_sla: wakeup_master returned NULL\n");
2144 2144 /* release reader lock, grab writer lock */
2145 2145 (void) rw_unlock(&client_rwlock[setno]);
2146 2146 (void) rw_wrlock(&client_rwlock[setno]);
2147 2147 mdmn_clnt_destroy(client[setno][sender]);
2148 2148 if (client[setno][sender] != (CLIENT *)NULL) {
2149 2149 client[setno][sender] = (CLIENT *)NULL;
2150 2150 }
2151 2151 (void) rw_unlock(&client_rwlock[setno]);
2152 2152 retries--;
2153 2153 commd_debug(MD_MMV_PROC_S,
2154 2154 "retries = %d\n", retries);
2155 2155 continue;
2156 2156 }
2157 2157 if (*ret != MDMNE_ACK) {
2158 2158 commd_debug(MD_MMV_PROC_S, "proc_sla: "
2159 2159 "wakeup_master returned %d\n", *ret);
2160 2160 (void) rw_unlock(&client_rwlock[setno]);
2161 2161 break;
2162 2162 } else { /* Good case */
2163 2163 successfully_returned = 1;
2164 2164 (void) rw_unlock(&client_rwlock[setno]);
2165 2165 }
2166 2166 }
2167 2167 }
2168 2168
2169 2169 (void) rw_unlock(&set_desc_rwlock[setno]);
2170 2170 commd_debug(MD_MMV_PROC_S, "proc_sla: done (%d, 0x%llx-%d)\n",
2171 2171 MSGID_ELEMS(msg->msg_msgid));
2172 2172
2173 2173 if (ret != (int *)NULL)
2174 2174 free(ret);
2175 2175 free_msg(msg);
2176 2176 free_result(result);
2177 2177 }
2178 2178
2179 2179
2180 2180 /*
2181 2181 * mdmn_send_svc_2:
2182 2182 * ---------------
2183 2183 * Check that the issuing node is a legitimate one (i.e. is licensed to send
2184 2184 * messages to us), that the RPC request can be staged.
2185 2185 *
2186 2186 * Returns:
2187 2187 * 0 => no RPC request is in-flight, no deferred svc_sendreply()
2188 2188 * 1 => queued RPC request in-flight. Completion will be made (later)
2189 2189 * by a wakeup_initiator_2() [hopefully]
2190 2190 */
2191 2191 int
2192 2192 mdmn_send_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
2193 2193 {
2194 2194 int err;
2195 2195 set_t setno;
2196 2196 SVCXPRT *transp = rqstp->rq_xprt;
2197 2197 md_mn_msg_t *msg;
2198 2198 md_mn_result_t *resultp;
2199 2199 md_mn_msgclass_t class;
2200 2200 md_mn_msg_and_transp_t *matp;
2201 2201
2202 2202 msg = copy_msg(omsg, NULL);
2203 2203 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2204 2204
2205 2205 setno = msg->msg_setno;
2206 2206 class = mdmn_get_message_class(msg->msg_type);
2207 2207
2208 2208 /* If we are in the abort state, we error out immediately */
2209 2209 if (md_commd_global_state & MD_CGS_ABORTED) {
2210 2210 resultp = Zalloc(sizeof (md_mn_result_t));
2211 2211 resultp->mmr_comm_state = MDMNE_ABORT;
2212 2212 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2213 2213 free_result(resultp);
2214 2214 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2215 2215 return (0);
2216 2216 }
2217 2217
2218 2218 /* check if the global initialization is done */
2219 2219 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2220 2220 global_init();
2221 2221 }
2222 2222
2223 2223 commd_debug(MD_MMV_SEND,
2224 2224 "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2225 2225 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
2226 2226
2227 2227 /* Check for verbosity related message */
2228 2228 if (msg->msg_type == MD_MN_MSG_VERBOSITY) {
2229 2229 md_mn_verbose_t *d;
2230 2230
2231 2231 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data));
2232 2232 md_commd_global_verb = d->mmv_what;
2233 2233 /* everytime the bitmask is set, we reset the timer */
2234 2234 __savetime = gethrtime();
2235 2235 /*
2236 2236 * If local-only-flag is set, we are done here,
2237 2237 * otherwise we pass that message on to the master.
2238 2238 */
2239 2239 if (msg->msg_flags & MD_MSGF_LOCAL_ONLY) {
2240 2240 resultp = Zalloc(sizeof (md_mn_result_t));
2241 2241 resultp->mmr_comm_state = MDMNE_ACK;
2242 2242 mdmn_svc_sendreply(transp, xdr_md_mn_result_t,
2243 2243 (char *)resultp);
2244 2244 free_result(resultp);
2245 2245 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2246 2246 return (0);
2247 2247 }
2248 2248 }
2249 2249
2250 2250 /*
2251 2251 * Are we entering the abort state?
2252 2252 * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because
2253 2253 * this message cannot be distributed anyway.
2254 2254 * So, it's safe to return immediately.
2255 2255 */
2256 2256 if (msg->msg_type == MD_MN_MSG_ABORT) {
2257 2257 md_commd_global_state |= MD_CGS_ABORTED;
2258 2258 resultp = Zalloc(sizeof (md_mn_result_t));
2259 2259 resultp->mmr_comm_state = MDMNE_ACK;
2260 2260 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2261 2261 free_result(resultp);
2262 2262 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2263 2263 return (0);
2264 2264 }
2265 2265
2266 2266
2267 2267 /*
2268 2268 * Is this message type blocked?
2269 2269 * If so we return MDMNE_CLASS_LOCKED, immediately
2270 2270 */
2271 2271 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) {
2272 2272 resultp = Zalloc(sizeof (md_mn_result_t));
2273 2273 resultp->mmr_comm_state = MDMNE_CLASS_LOCKED;
2274 2274 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2275 2275 free_result(resultp);
2276 2276 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2277 2277 commd_debug(MD_MMV_SEND,
2278 2278 "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
2279 2279 "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
2280 2280 msg->msg_type);
2281 2281 return (0);
2282 2282 }
2283 2283
2284 2284
2285 2285 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2286 2286 /* Can only use the appropriate mutexes if they are inited */
2287 2287 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2288 2288 (void) rw_wrlock(&set_desc_rwlock[setno]);
2289 2289 (void) rw_wrlock(&client_rwlock[setno]);
2290 2290 err = mdmn_init_set(setno, MDMN_SET_READY);
2291 2291 (void) rw_unlock(&client_rwlock[setno]);
2292 2292 (void) rw_unlock(&set_desc_rwlock[setno]);
2293 2293 } else {
2294 2294 err = mdmn_init_set(setno, MDMN_SET_READY);
2295 2295 }
2296 2296
2297 2297 if (err) {
2298 2298 /* couldn't initialize connections, cannot proceed */
2299 2299 resultp = Zalloc(sizeof (md_mn_result_t));
2300 2300 resultp->mmr_comm_state = err;
2301 2301 mdmn_svc_sendreply(transp, xdr_md_mn_result_t,
2302 2302 (char *)resultp);
2303 2303 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2304 2304 free_result(resultp);
2305 2305 commd_debug(MD_MMV_SEND,
2306 2306 "send: init err = %d\n", err);
2307 2307 return (0);
2308 2308 }
2309 2309 }
2310 2310
2311 2311 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2312 2312 if ((mdmn_is_class_suspended(setno, class) == TRUE) &&
2313 2313 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) {
2314 2314 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2315 2315 resultp = Zalloc(sizeof (md_mn_result_t));
2316 2316 resultp->mmr_comm_state = MDMNE_SUSPENDED;
2317 2317 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
2318 2318 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2319 2319 free_result(resultp);
2320 2320 commd_debug(MD_MMV_SEND,
2321 2321 "send: class suspended (%d, 0x%llx-%d), set=%d, "
2322 2322 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
2323 2323 setno, class, msg->msg_type);
2324 2324 return (0);
2325 2325 }
2326 2326 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2327 2327
2328 2328 /* is this rpc request coming from the local node? */
2329 2329 if (check_license(rqstp, 0) == FALSE) {
2330 2330 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
2331 2331 commd_debug(MD_MMV_SEND,
2332 2332 "send: check licence fail(%d, 0x%llx-%d), set=%d, "
2333 2333 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
2334 2334 setno, class, msg->msg_type);
2335 2335 return (0);
2336 2336 }
2337 2337
2338 2338
2339 2339 /*
2340 2340 * We allocate a structure that can take two pointers in order to pass
2341 2341 * both the message and the transp into thread_create.
2342 2342 * The free for this alloc is done in mdmn_send_to_work()
2343 2343 */
2344 2344 matp = Malloc(sizeof (md_mn_msg_and_transp_t));
2345 2345 matp->mat_msg = msg;
2346 2346 matp->mat_transp = transp;
2347 2347
2348 2348 /*
2349 2349 * create a thread here that calls work on the master.
2350 2350 * If we are already on the master, this would block if running
2351 2351 * in the same context. (our service is single threaded)(
2352 2352 * Make it a detached thread because it will not communicate with
2353 2353 * anybody thru thr_* mechanisms
2354 2354 */
2355 2355 (void) thr_create(NULL, 0, mdmn_send_to_work, (void *) matp,
2356 2356 THR_DETACHED, NULL);
2357 2357
2358 2358 commd_debug(MD_MMV_SEND, "send: done (%d, 0x%llx-%d)\n",
2359 2359 MSGID_ELEMS(msg->msg_msgid));
2360 2360 /*
2361 2361 * We return here without sending results. This will be done by
2362 2362 * mdmn_wakeup_initiator_svc_2() as soon as the results are available.
2363 2363 * Until then the calling send_message will be blocked, while we
2364 2364 * are able to take calls.
2365 2365 */
2366 2366
2367 2367 return (1);
2368 2368 }
2369 2369
2370 2370 /* ARGSUSED */
2371 2371 int *
2372 2372 mdmn_work_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
2373 2373 {
2374 2374 int err;
2375 2375 set_t setno;
2376 2376 thread_t tid;
2377 2377 int *retval;
2378 2378 md_mn_msg_t *msg;
2379 2379 md_mn_msgclass_t class;
2380 2380
2381 2381 retval = Malloc(sizeof (int));
2382 2382
2383 2383 /* If we are in the abort state, we error out immediately */
2384 2384 if (md_commd_global_state & MD_CGS_ABORTED) {
2385 2385 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2386 2386 *retval = MDMNE_ABORT;
2387 2387 return (retval);
2388 2388 }
2389 2389
2390 2390 msg = copy_msg(omsg, NULL);
2391 2391 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg);
2392 2392
2393 2393 /*
2394 2394 * Is this message type blocked?
2395 2395 * If so we return MDMNE_CLASS_LOCKED, immediately.
2396 2396 * This check is performed on master and slave.
2397 2397 */
2398 2398 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) {
2399 2399 *retval = MDMNE_CLASS_LOCKED;
2400 2400 return (retval);
2401 2401 }
2402 2402
2403 2403 /* check if the global initialization is done */
2404 2404 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2405 2405 global_init();
2406 2406 }
2407 2407
2408 2408 class = mdmn_get_message_class(msg->msg_type);
2409 2409 setno = msg->msg_setno;
2410 2410
2411 2411 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2412 2412 /* Can only use the appropriate mutexes if they are inited */
2413 2413 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2414 2414 (void) rw_wrlock(&set_desc_rwlock[setno]);
2415 2415 (void) rw_wrlock(&client_rwlock[setno]);
2416 2416 err = mdmn_init_set(setno, MDMN_SET_READY);
2417 2417 (void) rw_unlock(&client_rwlock[setno]);
2418 2418 (void) rw_unlock(&set_desc_rwlock[setno]);
2419 2419 } else {
2420 2420 err = mdmn_init_set(setno, MDMN_SET_READY);
2421 2421 }
2422 2422
2423 2423 if (err) {
2424 2424 *retval = MDMNE_CANNOT_CONNECT;
2425 2425 free_msg(msg);
2426 2426 return (retval);
2427 2427 }
2428 2428 }
2429 2429
2430 2430 /* is this rpc request coming from a licensed node? */
2431 2431 if (check_license(rqstp, msg->msg_sender) == FALSE) {
2432 2432 free_msg(msg);
2433 2433 *retval = MDMNE_RPC_FAIL;
2434 2434 return (retval);
2435 2435 }
2436 2436
2437 2437 commd_debug(MD_MMV_WORK,
2438 2438 "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, "
2439 2439 "flags=0x%x\n",
2440 2440 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type,
2441 2441 msg->msg_flags);
2442 2442
2443 2443 /* Check for various CLASS0 message types */
2444 2444 if (msg->msg_type == MD_MN_MSG_VERBOSITY) {
2445 2445 md_mn_verbose_t *d;
2446 2446
2447 2447 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data));
2448 2448 /* for now we ignore set / class in md_mn_verbose_t */
2449 2449 md_commd_global_verb = d->mmv_what;
2450 2450 /* everytime the bitmask is set, we reset the timer */
2451 2451 __savetime = gethrtime();
2452 2452 }
2453 2453
2454 2454 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2455 2455
2456 2456 /* check if class is locked via a call to mdmn_comm_lock_svc_2 */
2457 2457 if (mdmn_is_class_locked(setno, class) == TRUE) {
2458 2458 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2459 2459 *retval = MDMNE_CLASS_LOCKED;
2460 2460 free_msg(msg);
2461 2461 return (retval);
2462 2462 }
2463 2463 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2464 2464
2465 2465 /* Check if the class is busy right now. Do it only on the master */
2466 2466 (void) rw_rdlock(&set_desc_rwlock[setno]);
2467 2467 if (set_descriptor[setno]->sd_mn_am_i_master) {
2468 2468 (void) rw_unlock(&set_desc_rwlock[setno]);
2469 2469 /*
2470 2470 * If the class is currently suspended, don't accept new
2471 2471 * messages, unless they are flagged with an override bit.
2472 2472 */
2473 2473 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2474 2474 if ((mdmn_is_class_suspended(setno, class) == TRUE) &&
2475 2475 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) {
2476 2476 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2477 2477 *retval = MDMNE_SUSPENDED;
2478 2478 commd_debug(MD_MMV_SEND,
2479 2479 "send: set %d is suspended\n", setno);
2480 2480 free_msg(msg);
2481 2481 return (retval);
2482 2482 }
2483 2483 if (mdmn_mark_class_busy(setno, class) == FALSE) {
2484 2484 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2485 2485 *retval = MDMNE_CLASS_BUSY;
2486 2486 free_msg(msg);
2487 2487 return (retval);
2488 2488 }
2489 2489 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2490 2490 /*
2491 2491 * Because the real processing of the message takes time we
2492 2492 * create a thread for it. So the master thread can continue
2493 2493 * to run and accept further messages.
2494 2494 */
2495 2495 *retval = thr_create(NULL, 0,
2496 2496 (void *(*)(void *))mdmn_master_process_msg, (void *)msg,
2497 2497 THR_DETACHED|THR_SUSPENDED, &tid);
2498 2498 } else {
2499 2499 (void) rw_unlock(&set_desc_rwlock[setno]);
2500 2500 *retval = thr_create(NULL, 0,
2501 2501 (void *(*)(void *)) mdmn_slave_process_msg, (void *)msg,
2502 2502 THR_DETACHED|THR_SUSPENDED, &tid);
2503 2503 }
2504 2504
2505 2505 if (*retval != 0) {
2506 2506 *retval = MDMNE_THR_CREATE_FAIL;
2507 2507 free_msg(msg);
2508 2508 return (retval);
2509 2509 }
2510 2510
2511 2511 /* Now run the new thread */
2512 2512 (void) thr_continue(tid);
2513 2513
2514 2514 commd_debug(MD_MMV_WORK,
2515 2515 "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n",
2516 2516 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type);
2517 2517
2518 2518 *retval = MDMNE_ACK; /* this means success */
2519 2519 return (retval);
2520 2520 }
2521 2521
2522 2522 /* ARGSUSED */
2523 2523 int *
2524 2524 mdmn_wakeup_initiator_svc_2(md_mn_result_t *res, struct svc_req *rqstp)
2525 2525 {
2526 2526
2527 2527 int *retval;
2528 2528 int err;
2529 2529 set_t setno;
2530 2530 mutex_t *mx; /* protection of initiator_table */
2531 2531 SVCXPRT *transp = NULL;
2532 2532 md_mn_msgid_t initiator_table_id;
2533 2533 md_mn_msgclass_t class;
2534 2534
2535 2535 retval = Malloc(sizeof (int));
2536 2536
2537 2537 /* check if the global initialization is done */
2538 2538 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2539 2539 global_init();
2540 2540 }
2541 2541
2542 2542 setno = res->mmr_setno;
2543 2543
2544 2544 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2545 2545 /* set not ready means we just crashed are restarted now */
2546 2546 /* Can only use the appropriate mutexes if they are inited */
2547 2547 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2548 2548 (void) rw_wrlock(&set_desc_rwlock[setno]);
2549 2549 (void) rw_wrlock(&client_rwlock[setno]);
2550 2550 err = mdmn_init_set(setno, MDMN_SET_READY);
2551 2551 (void) rw_unlock(&client_rwlock[setno]);
2552 2552 (void) rw_unlock(&set_desc_rwlock[setno]);
2553 2553 } else {
2554 2554 err = mdmn_init_set(setno, MDMN_SET_READY);
2555 2555 }
2556 2556
2557 2557 if (err) {
2558 2558 *retval = MDMNE_CANNOT_CONNECT;
2559 2559 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2560 2560 return (retval);
2561 2561 }
2562 2562 }
2563 2563
2564 2564 /* is this rpc request coming from a licensed node? */
2565 2565 if (check_license(rqstp, res->mmr_sender) == FALSE) {
2566 2566 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2567 2567 *retval = MDMNE_RPC_FAIL;
2568 2568 return (retval);
2569 2569 }
2570 2570
2571 2571
2572 2572 class = mdmn_get_message_class(res->mmr_msgtype);
2573 2573 mx = mdmn_get_initiator_table_mx(setno, class);
2574 2574
2575 2575 commd_debug(MD_MMV_WAKE_I,
2576 2576 "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n",
2577 2577 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype);
2578 2578
2579 2579 (void) mutex_lock(mx);
2580 2580
2581 2581 /*
2582 2582 * Search the initiator wakeup table.
2583 2583 * If we find an entry here (which should always be true)
2584 2584 * we are on the initiating node and we wakeup the original
2585 2585 * local rpc call.
2586 2586 */
2587 2587 mdmn_get_initiator_table_id(setno, class, &initiator_table_id);
2588 2588
2589 2589 if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) {
2590 2590 transp = mdmn_get_initiator_table_transp(setno, class);
2591 2591 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res);
2592 2592 svc_done(transp);
2593 2593 mdmn_unregister_initiator_table(setno, class);
2594 2594 *retval = MDMNE_ACK;
2595 2595
2596 2596 commd_debug(MD_MMV_WAKE_I,
2597 2597 "wake_ini: replied (%d, 0x%llx-%d)\n",
2598 2598 MSGID_ELEMS(res->mmr_msgid));
2599 2599 } else {
2600 2600 commd_debug(MD_MMV_WAKE_I,
2601 2601 "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n",
2602 2602 MSGID_ELEMS(res->mmr_msgid));
2603 2603 *retval = MDMNE_NO_WAKEUP_ENTRY;
2604 2604 }
2605 2605 (void) mutex_unlock(mx);
2606 2606 /* less work for check_timeouts */
2607 2607 (void) mutex_lock(&check_timeout_mutex);
2608 2608 if (messages_on_their_way == 0) {
2609 2609 commd_debug(MD_MMV_WAKE_I,
2610 2610 "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n",
2611 2611 MSGID_ELEMS(res->mmr_msgid));
2612 2612 } else {
2613 2613 messages_on_their_way--;
2614 2614 }
2615 2615 (void) mutex_unlock(&check_timeout_mutex);
2616 2616 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2617 2617
2618 2618 return (retval);
2619 2619 }
2620 2620
2621 2621
2622 2622 /*
2623 2623 * res must be free'd by the thread we wake up
2624 2624 */
2625 2625 /* ARGSUSED */
2626 2626 int *
2627 2627 mdmn_wakeup_master_svc_2(md_mn_result_t *ores, struct svc_req *rqstp)
2628 2628 {
2629 2629
2630 2630 int *retval;
2631 2631 int err;
2632 2632 set_t setno;
2633 2633 cond_t *cv;
2634 2634 mutex_t *mx;
2635 2635 md_mn_msgid_t master_table_id;
2636 2636 md_mn_nodeid_t sender;
2637 2637 md_mn_result_t *res;
2638 2638 md_mn_msgclass_t class;
2639 2639
2640 2640 retval = Malloc(sizeof (int));
2641 2641
2642 2642 /* check if the global initialization is done */
2643 2643 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2644 2644 global_init();
2645 2645 }
2646 2646
2647 2647 /* Need to copy the results here, as they are static for RPC */
2648 2648 res = copy_result(ores);
2649 2649 xdr_free(xdr_md_mn_result_t, (caddr_t)ores);
2650 2650
2651 2651 class = mdmn_get_message_class(res->mmr_msgtype);
2652 2652 setno = res->mmr_setno;
2653 2653
2654 2654 if (md_mn_set_inited[setno] != MDMN_SET_READY) {
2655 2655 /* set not ready means we just crashed are restarted now */
2656 2656 /* Can only use the appropriate mutexes if they are inited */
2657 2657 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) {
2658 2658 (void) rw_wrlock(&set_desc_rwlock[setno]);
2659 2659 (void) rw_wrlock(&client_rwlock[setno]);
2660 2660 err = mdmn_init_set(setno, MDMN_SET_READY);
2661 2661 (void) rw_unlock(&client_rwlock[setno]);
2662 2662 (void) rw_unlock(&set_desc_rwlock[setno]);
2663 2663 } else {
2664 2664 err = mdmn_init_set(setno, MDMN_SET_READY);
2665 2665 }
2666 2666
2667 2667 if (err) {
2668 2668 *retval = MDMNE_CANNOT_CONNECT;
2669 2669 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2670 2670 return (retval);
2671 2671 }
2672 2672 }
2673 2673
2674 2674 /* is this rpc request coming from a licensed node? */
2675 2675 if (check_license(rqstp, res->mmr_sender) == FALSE) {
2676 2676 *retval = MDMNE_RPC_FAIL;
2677 2677 xdr_free(xdr_md_mn_result_t, (caddr_t)res);
2678 2678 return (retval);
2679 2679 }
2680 2680
2681 2681
2682 2682 commd_debug(MD_MMV_WAKE_M,
2683 2683 "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d "
2684 2684 "from %d\n",
2685 2685 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype,
2686 2686 res->mmr_sender);
2687 2687 /*
2688 2688 * The mutex and cv are needed for waking up the thread
2689 2689 * sleeping in mdmn_master_process_msg()
2690 2690 */
2691 2691 mx = mdmn_get_master_table_mx(setno, class);
2692 2692 cv = mdmn_get_master_table_cv(setno, class);
2693 2693
2694 2694 /*
2695 2695 * lookup the master wakeup table
2696 2696 * If we find our message, we are on the master and
2697 2697 * called by a slave that finished processing a message.
2698 2698 * We store the results in the appropriate slot and
2699 2699 * wakeup the thread (mdmn_master_process_msg()) waiting for them.
2700 2700 */
2701 2701 (void) mutex_lock(mx);
2702 2702 mdmn_get_master_table_id(setno, class, &master_table_id);
2703 2703 sender = mdmn_get_master_table_addr(setno, class);
2704 2704
2705 2705 if (MSGID_CMP(&(master_table_id), &(res->mmr_msgid))) {
2706 2706 if (sender == res->mmr_sender) {
2707 2707 mdmn_set_master_table_res(setno, class, res);
2708 2708 (void) cond_signal(cv);
2709 2709 *retval = MDMNE_ACK;
2710 2710 } else {
2711 2711 /* id is correct but wrong sender (I smell a timeout) */
2712 2712 commd_debug(MD_MMV_WAKE_M,
2713 2713 "wakeup master got unsolicited message: "
2714 2714 "(%d, 0x%llx-%d) from %d\n",
2715 2715 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender);
2716 2716 free_result(res);
2717 2717 *retval = MDMNE_TIMEOUT;
2718 2718 }
2719 2719 } else {
2720 2720 /* id is wrong, smells like a very late timeout */
2721 2721 commd_debug(MD_MMV_WAKE_M,
2722 2722 "wakeup master got unsolicited message: "
2723 2723 "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n",
2724 2724 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender,
2725 2725 MSGID_ELEMS(master_table_id));
2726 2726 free_result(res);
2727 2727 *retval = MDMNE_NO_WAKEUP_ENTRY;
2728 2728 }
2729 2729
2730 2730 (void) mutex_unlock(mx);
2731 2731
2732 2732 return (retval);
2733 2733 }
2734 2734
2735 2735 /*
2736 2736 * Lock a set/class combination.
2737 2737 * This is mainly done for debug purpose.
2738 2738 * This set/class combination immediately is blocked,
2739 2739 * even in the middle of sending messages to multiple slaves.
2740 2740 * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same
2741 2741 * set/class combination.
2742 2742 *
2743 2743 * Special messages of class MD_MSG_CLASS0 can never be locked.
2744 2744 * e.g. MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT
2745 2745 *
2746 2746 * That means, if MD_MSG_CLASS0 is specified, we lock all classes from
2747 2747 * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES
2748 2748 *
2749 2749 * set must be between 1 and MD_MAXSETS
2750 2750 * class can be:
2751 2751 * MD_MSG_CLASS0 which means all other classes in this case
2752 2752 * or one specific class (< MD_MN_NCLASSES)
2753 2753 *
2754 2754 * Returns:
2755 2755 * MDMNE_ACK on sucess (locking a locked class is Ok)
2756 2756 * MDMNE_EINVAL if a parameter is out of range
2757 2757 */
2758 2758
2759 2759 /* ARGSUSED */
2760 2760 int *
2761 2761 mdmn_comm_lock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2762 2762 {
2763 2763 int *retval;
2764 2764 set_t setno = msc->msc_set;
2765 2765 md_mn_msgclass_t class = msc->msc_class;
2766 2766
2767 2767 retval = Malloc(sizeof (int));
2768 2768
2769 2769 /* check if the global initialization is done */
2770 2770 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2771 2771 global_init();
2772 2772 }
2773 2773
2774 2774 /* is this rpc request coming from the local node ? */
2775 2775 if (check_license(rqstp, 0) == FALSE) {
2776 2776 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2777 2777 *retval = MDMNE_RPC_FAIL;
2778 2778 return (retval);
2779 2779 }
2780 2780
2781 2781 /* Perform some range checking */
2782 2782 if ((setno == 0) || (setno >= MD_MAXSETS) ||
2783 2783 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) {
2784 2784 *retval = MDMNE_EINVAL;
2785 2785 return (retval);
2786 2786 }
2787 2787
2788 2788 commd_debug(MD_MMV_MISC, "lock: set=%d, class=%d\n", setno, class);
2789 2789 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2790 2790 if (class != MD_MSG_CLASS0) {
2791 2791 mdmn_mark_class_locked(setno, class);
2792 2792 } else {
2793 2793 /* MD_MSG_CLASS0 is used as a wild card for all classes */
2794 2794 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
2795 2795 mdmn_mark_class_locked(setno, class);
2796 2796 }
2797 2797 }
2798 2798 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2799 2799
2800 2800 *retval = MDMNE_ACK;
2801 2801 return (retval);
2802 2802 }
2803 2803
2804 2804 /*
2805 2805 * Unlock a set/class combination.
2806 2806 * set must be between 1 and MD_MAXSETS
2807 2807 * class can be:
2808 2808 * MD_MSG_CLASS0 which means all other classes in this case (like above)
2809 2809 * or one specific class (< MD_MN_NCLASSES)
2810 2810 *
2811 2811 * Returns:
2812 2812 * MDMNE_ACK on sucess (unlocking an unlocked class is Ok)
2813 2813 * MDMNE_EINVAL if a parameter is out of range
2814 2814 */
2815 2815 /* ARGSUSED */
2816 2816 int *
2817 2817 mdmn_comm_unlock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2818 2818 {
2819 2819 int *retval;
2820 2820 set_t setno = msc->msc_set;
2821 2821 md_mn_msgclass_t class = msc->msc_class;
2822 2822
2823 2823 retval = Malloc(sizeof (int));
2824 2824
2825 2825 /* check if the global initialization is done */
2826 2826 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2827 2827 global_init();
2828 2828 }
2829 2829
2830 2830 /* is this rpc request coming from the local node ? */
2831 2831 if (check_license(rqstp, 0) == FALSE) {
2832 2832 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2833 2833 *retval = MDMNE_RPC_FAIL;
2834 2834 return (retval);
2835 2835 }
2836 2836
2837 2837 /* Perform some range checking */
2838 2838 if ((setno == 0) || (setno >= MD_MAXSETS) ||
2839 2839 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) {
2840 2840 *retval = MDMNE_EINVAL;
2841 2841 return (retval);
2842 2842 }
2843 2843 commd_debug(MD_MMV_MISC, "unlock: set=%d, class=%d\n", setno, class);
2844 2844
2845 2845 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2846 2846 if (class != MD_MSG_CLASS0) {
2847 2847 mdmn_mark_class_unlocked(setno, class);
2848 2848 } else {
2849 2849 /* MD_MSG_CLASS0 is used as a wild card for all classes */
2850 2850 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) {
2851 2851 mdmn_mark_class_unlocked(setno, class);
2852 2852 }
2853 2853 }
2854 2854 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2855 2855
2856 2856 *retval = MDMNE_ACK;
2857 2857 return (retval);
2858 2858 }
2859 2859
2860 2860 /*
2861 2861 * mdmn_comm_suspend_svc_2(setno, class)
2862 2862 *
2863 2863 * Drain all outstanding messages for a given set/class combination
2864 2864 * and don't allow new messages to be processed.
2865 2865 *
2866 2866 * Special messages of class MD_MSG_CLASS0 can never be locked.
2867 2867 * e.g. MD_MN_MSG_VERBOSITY
2868 2868 *
2869 2869 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS
2870 2870 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES
2871 2871 *
2872 2872 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
2873 2873 * one class as being suspended.
2874 2874 * If messages for this class are currently on their way,
2875 2875 * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned.
2876 2876 *
2877 2877 * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set.
2878 2878 * Messages must be generated in ascending order.
2879 2879 * This means, a message cannot create submessages with the same or lower class.
2880 2880 * Draining messages must go from 1 to NCLASSES in order to ensure we don't
2881 2881 * generate a hanging situation here.
2882 2882 * We mark class 1 as being suspended.
2883 2883 * if the class is not busy, we proceed with class 2
2884 2884 * and so on
2885 2885 * if a class *is* busy, we cannot continue here, but return
2886 2886 * MDMNE_SET_NOT_DRAINED.
2887 2887 * We expect the caller to hold on for some seconds and try again.
2888 2888 * When that message, that held the class busy is done in
2889 2889 * mdmn_master_process_msg(), mdmn_mark_class_unbusy() called.
2890 2890 * There it is checked if the class is about to drain.
2891 2891 * In that case it tries to drain all higher classes there.
2892 2892 *
2893 2893 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
2894 2894 * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are
2895 2895 * completely drained.
2896 2896 *
2897 2897 * Returns:
2898 2898 * MDMNE_ACK on sucess (set is drained, no outstanding messages)
2899 2899 * MDMNE_SET_NOT_DRAINED if drain process is started, but there are
2900 2900 * still outstanding messages for this set(s)
2901 2901 * MDMNE_EINVAL if setno is out of range
2902 2902 * MDMNE_NOT_JOINED if the set is not yet initialized on this node
2903 2903 */
2904 2904
2905 2905 /* ARGSUSED */
2906 2906 int *
2907 2907 mdmn_comm_suspend_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
2908 2908 {
2909 2909 int *retval;
2910 2910 int failure = 0;
2911 2911 set_t startset, endset;
2912 2912 set_t setno = msc->msc_set;
2913 2913 md_mn_msgclass_t oclass = msc->msc_class;
2914 2914 #ifdef NOT_YET_NEEDED
2915 2915 uint_t flags = msc->msc_flags;
2916 2916 #endif /* NOT_YET_NEEDED */
2917 2917 md_mn_msgclass_t class;
2918 2918
2919 2919 retval = Malloc(sizeof (int));
2920 2920
2921 2921 /* check if the global initialization is done */
2922 2922 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
2923 2923 global_init();
2924 2924 }
2925 2925
2926 2926 /* is this rpc request coming from the local node ? */
2927 2927 if (check_license(rqstp, 0) == FALSE) {
2928 2928 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
2929 2929 *retval = MDMNE_RPC_FAIL;
2930 2930 return (retval);
2931 2931 }
2932 2932
2933 2933 commd_debug(MD_MMV_MISC, "suspend: called for set=%d class=%d\n",
2934 2934 setno, oclass);
2935 2935
2936 2936 /* Perform some range checking */
2937 2937 if (setno >= MD_MAXSETS) {
2938 2938 *retval = MDMNE_EINVAL;
2939 2939 commd_debug(MD_MMV_MISC, "suspend: returning MDMNE_EINVAL\n");
2940 2940 return (retval);
2941 2941 }
2942 2942
2943 2943 /* setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. */
2944 2944 if (setno == MD_COMM_ALL_SETS) {
2945 2945 startset = 1;
2946 2946 endset = MD_MAXSETS - 1;
2947 2947 } else {
2948 2948 startset = setno;
2949 2949 endset = setno;
2950 2950 }
2951 2951
2952 2952 for (setno = startset; setno <= endset; setno++) {
2953 2953 /* Here we need the mutexes for the set to be setup */
2954 2954 if (md_mn_set_inited[setno] != MDMN_SET_MUTEXES) {
2955 2955 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES);
2956 2956 }
2957 2957
2958 2958 (void) mutex_lock(&mdmn_busy_mutex[setno]);
2959 2959 /* shall we drain all classes of this set? */
2960 2960 if (oclass == MD_COMM_ALL_CLASSES) {
2961 2961 for (class = 1; class < MD_MN_NCLASSES; class ++) {
2962 2962 commd_debug(MD_MMV_MISC,
2963 2963 "suspend: suspending set %d, class %d\n",
2964 2964 setno, class);
2965 2965 *retval = mdmn_mark_class_suspended(setno,
2966 2966 class, MDMN_SUSPEND_ALL);
2967 2967 if (*retval == MDMNE_SET_NOT_DRAINED) {
2968 2968 failure++;
2969 2969 }
2970 2970 }
2971 2971 } else {
2972 2972 /* only drain one specific class */
2973 2973 commd_debug(MD_MMV_MISC,
2974 2974 "suspend: suspending set=%d class=%d\n",
2975 2975 setno, oclass);
2976 2976 *retval = mdmn_mark_class_suspended(setno, oclass,
2977 2977 MDMN_SUSPEND_1);
2978 2978 if (*retval == MDMNE_SET_NOT_DRAINED) {
2979 2979 failure++;
2980 2980 }
2981 2981 }
2982 2982 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
2983 2983 }
2984 2984 /* If one or more sets are not entirely drained, failure is non-zero */
2985 2985 if (failure != 0) {
2986 2986 *retval = MDMNE_SET_NOT_DRAINED;
2987 2987 commd_debug(MD_MMV_MISC,
2988 2988 "suspend: returning MDMNE_SET_NOT_DRAINED\n");
2989 2989 } else {
2990 2990 *retval = MDMNE_ACK;
2991 2991 }
2992 2992
2993 2993 return (retval);
2994 2994 }
2995 2995
2996 2996 /*
2997 2997 * mdmn_comm_resume_svc_2(setno, class)
2998 2998 *
2999 2999 * Resume processing messages for a given set.
3000 3000 * This incorporates the repeal of a previous suspend operation.
3001 3001 *
3002 3002 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS
3003 3003 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES
3004 3004 *
3005 3005 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
3006 3006 * one class as being resumed.
3007 3007 *
3008 3008 * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set.
3009 3009 *
3010 3010 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
3011 3011 *
3012 3012 * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also
3013 3013 * reset any ABORT flag from the global state.
3014 3014 *
3015 3015 * Returns:
3016 3016 * MDMNE_ACK on sucess (resuming an unlocked set is Ok)
3017 3017 * MDMNE_EINVAL if setno is out of range
3018 3018 * MDMNE_NOT_JOINED if the set is not yet initialized on this node
3019 3019 */
3020 3020 /* ARGSUSED */
3021 3021 int *
3022 3022 mdmn_comm_resume_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
3023 3023 {
3024 3024 int *retval;
3025 3025 set_t startset, endset;
3026 3026 set_t setno = msc->msc_set;
3027 3027 md_mn_msgclass_t oclass = msc->msc_class;
3028 3028 uint_t flags = msc->msc_flags;
3029 3029 md_mn_msgclass_t class;
3030 3030
3031 3031 retval = Malloc(sizeof (int));
3032 3032
3033 3033 /* check if the global initialization is done */
3034 3034 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3035 3035 global_init();
3036 3036 }
3037 3037
3038 3038 /* is this rpc request coming from the local node ? */
3039 3039 if (check_license(rqstp, 0) == FALSE) {
3040 3040 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc);
3041 3041 *retval = MDMNE_RPC_FAIL;
3042 3042 return (retval);
3043 3043 }
3044 3044
3045 3045 commd_debug(MD_MMV_MISC, "resume: called for set=%d class=%d\n",
3046 3046 setno, oclass);
3047 3047
3048 3048 /* Perform some range checking */
3049 3049 if (setno > MD_MAXSETS) {
3050 3050 *retval = MDMNE_EINVAL;
3051 3051 return (retval);
3052 3052 }
3053 3053
3054 3054 if (setno == MD_COMM_ALL_SETS) {
3055 3055 startset = 1;
3056 3056 endset = MD_MAXSETS - 1;
3057 3057 if (oclass == MD_COMM_ALL_CLASSES) {
3058 3058 /* This is the point where we "unabort" the commd */
3059 3059 commd_debug(MD_MMV_MISC, "resume: resetting ABORT\n");
3060 3060 md_commd_global_state &= ~MD_CGS_ABORTED;
3061 3061 }
3062 3062 } else {
3063 3063 startset = setno;
3064 3064 endset = setno;
3065 3065 }
3066 3066
3067 3067 for (setno = startset; setno <= endset; setno++) {
3068 3068
3069 3069 /* Here we need the mutexes for the set to be setup */
3070 3070 if ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0) {
3071 3071 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES);
3072 3072 }
3073 3073
3074 3074 (void) mutex_lock(&mdmn_busy_mutex[setno]);
3075 3075
3076 3076 if (oclass == MD_COMM_ALL_CLASSES) {
3077 3077 int end_class = 1;
3078 3078 /*
3079 3079 * When SUSPENDing all classes, we go
3080 3080 * from 1 to MD_MN_NCLASSES-1
3081 3081 * The correct reverse action is RESUMing
3082 3082 * from MD_MN_NCLASSES-1 to 1 (or 2)
3083 3083 */
3084 3084
3085 3085 if (flags & MD_MSCF_DONT_RESUME_CLASS1) {
3086 3086 end_class = 2;
3087 3087 }
3088 3088
3089 3089 /*
3090 3090 * Then mark all classes of this set as no longer
3091 3091 * suspended. This supersedes any previous suspend(1)
3092 3092 * calls and resumes the set entirely.
3093 3093 */
3094 3094 for (class = MD_MN_NCLASSES - 1; class >= end_class;
3095 3095 class --) {
3096 3096 commd_debug(MD_MMV_MISC,
3097 3097 "resume: resuming set=%d class=%d\n",
3098 3098 setno, class);
3099 3099 mdmn_mark_class_resumed(setno, class,
3100 3100 (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1));
3101 3101 }
3102 3102 } else {
3103 3103 /*
3104 3104 * In this case only one class is marked as not
3105 3105 * suspended. If a suspend(all) is currently active for
3106 3106 * this set, this class will still be suspended.
3107 3107 * That state will be cleared by a suspend(all)
3108 3108 * (see above)
3109 3109 */
3110 3110 commd_debug(MD_MMV_MISC,
3111 3111 "resume: resuming set=%d class=%d\n",
3112 3112 setno, oclass);
3113 3113 mdmn_mark_class_resumed(setno, oclass, MDMN_SUSPEND_1);
3114 3114 }
3115 3115
3116 3116 (void) mutex_unlock(&mdmn_busy_mutex[setno]);
3117 3117 }
3118 3118
3119 3119 *retval = MDMNE_ACK;
3120 3120 return (retval);
3121 3121 }
3122 3122 /* ARGSUSED */
3123 3123 int *
3124 3124 mdmn_comm_reinit_set_svc_2(set_t *setnop, struct svc_req *rqstp)
3125 3125 {
3126 3126 int *retval;
3127 3127 md_mnnode_desc *node;
3128 3128 set_t setno = *setnop;
3129 3129
3130 3130 retval = Malloc(sizeof (int));
3131 3131
3132 3132 /* check if the global initialization is done */
3133 3133 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3134 3134 global_init();
3135 3135 }
3136 3136
3137 3137 /* is this rpc request coming from the local node ? */
3138 3138 if (check_license(rqstp, 0) == FALSE) {
3139 3139 xdr_free(xdr_set_t, (caddr_t)setnop);
3140 3140 *retval = MDMNE_RPC_FAIL;
3141 3141 return (retval);
3142 3142 }
3143 3143
3144 3144 commd_debug(MD_MMV_MISC, "reinit: set=%d\n", setno);
3145 3145
3146 3146 (void) rw_rdlock(&set_desc_rwlock[setno]);
3147 3147 /*
3148 3148 * We assume, that all messages have been suspended previously.
3149 3149 *
3150 3150 * As we are modifying lots of clients here we grab the client_rwlock
3151 3151 * in writer mode. This ensures, no new messages come in.
3152 3152 */
3153 3153 (void) rw_wrlock(&client_rwlock[setno]);
3154 3154 /* This set is no longer initialized */
3155 3155
3156 3156 if ((set_descriptor[setno] != NULL) &&
3157 3157 (md_mn_set_inited[setno] & MDMN_SET_NODES)) {
3158 3158 /* destroy all rpc clients from this set */
3159 3159 for (node = set_descriptor[setno]->sd_nodelist; node;
3160 3160 node = node->nd_next) {
3161 3161 /*
3162 3162 * Since the CLIENT for ourself will be recreated
3163 3163 * shortly, and this node is guaranteed to be
3164 3164 * there after a reconfig, there's no reason to go
3165 3165 * through destroying it. It also avoids an issue
3166 3166 * with calling clnt_create() later from within the
3167 3167 * server thread, which can effectively deadlock
3168 3168 * itself due to RPC design limitations.
3169 3169 */
3170 3170 if (node == set_descriptor[setno]->sd_mn_mynode)
3171 3171 continue;
3172 3172 mdmn_clnt_destroy(client[setno][node->nd_nodeid]);
3173 3173 if (client[setno][node->nd_nodeid] != (CLIENT *)NULL) {
3174 3174 client[setno][node->nd_nodeid] = (CLIENT *)NULL;
3175 3175 }
3176 3176 }
3177 3177 md_mn_set_inited[setno] &= ~MDMN_SET_NODES;
3178 3178 }
3179 3179
3180 3180 commd_debug(MD_MMV_MISC, "reinit: done init_set(%d)\n", setno);
3181 3181
3182 3182 (void) rw_unlock(&client_rwlock[setno]);
3183 3183 (void) rw_unlock(&set_desc_rwlock[setno]);
3184 3184 *retval = MDMNE_ACK;
3185 3185 return (retval);
3186 3186 }
3187 3187
3188 3188 /*
3189 3189 * This is just an interface for testing purpose.
3190 3190 * Here we can disable single message types.
3191 3191 * If we block a message type, this is valid for all MN sets.
3192 3192 * If a message arrives later, and it's message type is blocked, it will
3193 3193 * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to
3194 3194 * resend this message over and over again.
3195 3195 */
3196 3196
3197 3197 /* ARGSUSED */
3198 3198 int *
3199 3199 mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
3200 3200 {
3201 3201 int *retval;
3202 3202 md_mn_msgtype_t type = mmtl->mmtl_type;
3203 3203 uint_t lock = mmtl->mmtl_lock;
3204 3204
3205 3205 retval = Malloc(sizeof (int));
3206 3206
3207 3207 /* check if the global initialization is done */
3208 3208 if ((md_commd_global_state & MD_CGS_INITED) == 0) {
3209 3209 global_init();
3210 3210 }
3211 3211
3212 3212 /* is this rpc request coming from the local node ? */
3213 3213 if (check_license(rqstp, 0) == FALSE) {
3214 3214 xdr_free(xdr_md_mn_type_and_lock_t, (caddr_t)mmtl);
3215 3215 *retval = MDMNE_RPC_FAIL;
3216 3216 return (retval);
3217 3217 }
3218 3218
3219 3219 /* Perform some range checking */
3220 3220 if ((type == 0) || (type >= MD_MN_NMESSAGES)) {
3221 3221 *retval = MDMNE_EINVAL;
3222 3222 return (retval);
3223 3223 }
3224 3224
3225 3225 commd_debug(MD_MMV_MISC, "msglock: type=%d, lock=%d\n", type, lock);
3226 3226 msgtype_lock_state[type] = lock;
3227 3227
3228 3228 *retval = MDMNE_ACK;
3229 3229 return (retval);
3230 3230 }
↓ open down ↓ |
2983 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX