1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/resource.h>
  29 #include <sys/priocntl.h>
  30 #include <sys/rtpriocntl.h>
  31 #include <sys/tspriocntl.h>
  32 #include <sys/wait.h>
  33 #include <sys/stat.h>
  34 
  35 #include <strings.h>
  36 #include <thread.h>
  37 #include <stdlib.h>
  38 #include <signal.h>
  39 #include <errno.h>
  40 #include <stdio.h>
  41 #include <fcntl.h>
  42 #include <locale.h>
  43 #include <unistd.h>
  44 #include <syslog.h>
  45 
  46 #include <sys/nsctl/cfg.h>
  47 #include <sys/nsctl/nsctl.h>
  48 #include <sys/nsctl/nsc_ioctl.h>
  49 #include <sys/nskernd.h>
  50 #include <nsctl.h>
  51 
  52 #include <sys/mkdev.h>
  53 #include <sys/nsctl/sv_efi.h>
  54 
  55 static const char *rdev = "/dev/nsctl";    /* device opened for NSCIOC_* ioctls */
  56 
  57 /*
  58  * Define a minimal user stack size in bytes over and above the
  59  * libthread THR_MIN_STACK minimum value.
  60  *
  61  * This stack size needs to be sufficient to run _newlwp() and then
  62  * ioctl() down into the kernel.
      *
      * The same allowance is used for the _dolock() threads created by
      * dolock().
  63  */
  64 #define NSK_STACK_SIZE  512
  65 
  66 /*
  67  * LWP scheduling control switches.
  68  *
  69  * allow_pri    - set to non-zero to enable priocntl() manipulations of
  70  *              created LWPs.
  71  * allow_rt     - set to non-zero to use the RT rather than the TS
  72  *              scheduling class when manipulating the schduling
  73  *              parameters for an LWP.  Only used if allow_pri is
  74  *              non-zero.
  75  */
  76 static int allow_pri = 1;
  77 static int allow_rt = 0;        /* disallow - bad interactions with timeout() */
  78 
  79 static int nsctl_fd = -1;
  80 static int sigterm;
  81 
  82 static int nthreads;            /* number of threads in the kernel */
  83 static int exiting;             /* shutdown in progress flag */
  84 static mutex_t thr_mutex = DEFAULTMUTEX;
  85 static mutex_t cfg_mutex = DEFAULTMUTEX;
  86 
  87 static int cl_nodeid = -1;
  88 
  89 static int display_msg = 0;
  90 static int delay_time = 30;
  91 
  92 static void
  93 usage(void)
  94 {
  95         (void) fprintf(stderr, gettext("usage: nskernd\n"));
  96         exit(255);
  97 }
  98 
  99 
 100 static void
 101 sighand(int sig)
 102 {
 103         if (sig == SIGTERM) {
 104                 sigterm++;
 105         }
 106 }
 107 
 108 
 109 /*
 110  * Returns: 1 - can enter kernel; 0 - shutdown in progress, do not enter kernel
 111  */
 112 int
 113 nthread_inc(void)
 114 {
 115         (void) mutex_lock(&thr_mutex);
 116         if (exiting) {
 117                 /* cannot enter kernel as nskernd is being shutdown - exit */
 118                 (void) mutex_unlock(&thr_mutex);
 119                 return (0);
 120         }
 121         nthreads++;
 122         (void) mutex_unlock(&thr_mutex);
 123         return (1);
 124 }
 125 
 126 
 127 void
 128 nthread_dec(void)
 129 {
 130         (void) mutex_lock(&thr_mutex);
 131         nthreads--;
 132         (void) mutex_unlock(&thr_mutex);
 133 }
 134 
 135 
 136 /*
 137  * returns: 1 - can shutdown; 0 - unable to shutdown
 138  */
 139 int
 140 canshutdown(void)
 141 {
 142         int rc = 1;
 143         time_t  start_delay;
 144 
 145         (void) mutex_lock(&thr_mutex);
 146         if (nthreads > 0) {
 147                 if (display_msg) {
 148                         (void) fprintf(stderr,
 149                             gettext("nskernd: unable to shutdown: "
 150                             "%d kernel threads in use\n"), nthreads);
 151                 }
 152                 start_delay = time(0);
 153                 while (nthreads > 0 && (time(0) - start_delay) < delay_time) {
 154                         (void) mutex_unlock(&thr_mutex);
 155                         (void) sleep(1);
 156                         (void) mutex_lock(&thr_mutex);
 157                         (void) fprintf(stderr,
 158                             gettext("nskernd:   delay shutdown: "
 159                             "%d kernel threads in use\n"), nthreads);
 160                 }
 161                 if (nthreads > 0) {
 162                         rc = 0;
 163                 } else {
 164                         exiting = 1;
 165                 }
 166         } else {
 167                 /* flag shutdown in progress */
 168                 exiting = 1;
 169         }
 170         (void) mutex_unlock(&thr_mutex);
 171 
 172         return (rc);
 173 }
 174 
 175 
 176 /*
 177  * returns: 1 - shutdown successful; 0 - unable to shutdown
 178  */
 179 int
 180 shutdown(void)
 181 {
 182         struct nskernd data;
 183         int rc;
 184 
 185         if (nsctl_fd < 0)
 186                 return (1);
 187 
 188         bzero(&data, sizeof (data));
 189         data.command = NSKERND_STOP;
 190 
 191         if (!canshutdown()) {
 192                 return (0);
 193         }
 194 
 195         rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
 196         if (rc < 0) {
 197                 if (errno != EINTR || !sigterm) {
 198                         (void) fprintf(stderr,
 199                             gettext("nskernd: NSKERND_STOP failed\n"));
 200                 }
 201         }
 202 
 203         return (1);
 204 }
 205 
 206 
 207 /*
 208  * First function run by a NSKERND_NEWLWP thread.
 209  *
 210  * Determines if it needs to change the scheduling priority of the LWP,
 211  * and then calls back into the kernel.
 212  */
 213 static void *
 214 _newlwp(void *arg)
 215 {
 216         struct nskernd nsk;
 217         pcparms_t pcparms;
 218         pcinfo_t pcinfo;
 219 
 220         /* copy arguments onto stack and free heap memory */
 221         bcopy(arg, &nsk, sizeof (nsk));
 222         free(arg);
 223 
 224         if (nsk.data2 && allow_pri) {
 225                 /* increase the scheduling priority of this LWP */
 226 
 227                 bzero(&pcinfo, sizeof (pcinfo));
 228                 (void) strcpy(pcinfo.pc_clname, allow_rt ? "RT" : "TS");
 229 
 230                 if (priocntl(0, 0, PC_GETCID, (char *)&pcinfo) < 0) {
 231                         (void) fprintf(stderr,
 232                             gettext(
 233                             "nskernd: priocntl(PC_GETCID) failed: %s\n"),
 234                             strerror(errno));
 235                         goto pri_done;
 236                 }
 237 
 238                 bzero(&pcparms, sizeof (pcparms));
 239                 pcparms.pc_cid = pcinfo.pc_cid;
 240 
 241                 if (allow_rt) {
 242                         ((rtparms_t *)pcparms.pc_clparms)->rt_pri =
 243                                 (pri_t)0; /* minimum RT priority */
 244                         ((rtparms_t *)pcparms.pc_clparms)->rt_tqsecs =
 245                                 (uint_t)RT_TQDEF;
 246                         ((rtparms_t *)pcparms.pc_clparms)->rt_tqnsecs =
 247                                 RT_TQDEF;
 248                 } else {
 249                         ((tsparms_t *)pcparms.pc_clparms)->ts_uprilim =
 250                                 ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
 251                         ((tsparms_t *)pcparms.pc_clparms)->ts_upri =
 252                                 ((tsinfo_t *)&pcinfo.pc_clinfo)->ts_maxupri;
 253                 }
 254 
 255                 if (priocntl(P_LWPID, P_MYID,
 256                     PC_SETPARMS, (char *)&pcparms) < 0) {
 257                         (void) fprintf(stderr,
 258                             gettext(
 259                             "nskernd: priocntl(PC_SETPARMS) failed: %s\n"),
 260                             strerror(errno));
 261                 }
 262         }
 263 
 264 pri_done:
 265         if (nthread_inc()) {
 266                 (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
 267                 nthread_dec();
 268         }
 269         return (NULL);
 270 }
 271 
 272 
 273 /*
 274  * Start a new thread bound to an LWP.
 275  *
 276  * This is the user level side of nsc_create_process().
 277  */
 278 static void
 279 newlwp(struct nskernd *req)
 280 {
 281         struct nskernd *nskp;
 282         thread_t tid;
 283         int rc;
 284 
 285         nskp = malloc(sizeof (*nskp));
 286         if (!nskp) {
 287 #ifdef DEBUG
 288                 (void) fprintf(stderr, gettext("nskernd: malloc(%d) failed\n"),
 289                     sizeof (*nskp));
 290 #endif
 291                 req->data1 = (uint64_t)ENOMEM;
 292                 return;
 293         }
 294 
 295         /* copy args for child */
 296         bcopy(req, nskp, sizeof (*nskp));
 297 
 298         rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
 299                 _newlwp, nskp, THR_BOUND|THR_DETACHED, &tid);
 300 
 301         if (rc != 0) {
 302                 /* thr_create failed */
 303 #ifdef DEBUG
 304                 (void) fprintf(stderr,
 305                     gettext("nskernd: thr_create failed: %s\n"),
 306                     strerror(errno));
 307 #endif
 308                 req->data1 = (uint64_t)errno;
 309                 free(nskp);
 310         } else {
 311                 /* success - _newlwp() will free nskp */
 312                 req->data1 = (uint64_t)0;
 313         }
 314 }
 315 
 316 static int
 317 log_iibmp_err(char *set, int flags)
 318 {
 319         CFGFILE *cfg;
 320         char key[CFG_MAX_KEY];
 321         char buf[CFG_MAX_BUF];
 322         char newflags[CFG_MAX_BUF];
 323         char outbuf[CFG_MAX_BUF];
 324         char *mst, *shd, *bmp, *mode, *ovr, *cnode, *opt, *grp;
 325         int setno, found = 0;
 326         int setlen;
 327         int rc = 0;
 328         pid_t pid = -1;
 329 
 330         if (set && *set) {
 331                 setlen = strlen(set);
 332         } else {
 333                 return (EINVAL);
 334         }
 335 
 336         (void) mutex_lock(&cfg_mutex);
 337         cfg = cfg_open("");
 338         if (!cfg) {
 339                 (void) mutex_unlock(&cfg_mutex);
 340                 return (ENXIO);
 341         }
 342 
 343         if (!cfg_lock(cfg, CFG_WRLOCK)) {
 344 
 345                 (void) mutex_unlock(&cfg_mutex);
 346                 cfg_close(cfg);
 347 
 348                 pid = fork();
 349 
 350                 if (pid == -1) {
 351                         (void) fprintf(stderr, gettext(
 352                             "nskernd: Error forking\n"));
 353                         return (errno);
 354                 } else if (pid > 0) {
 355                         (void) fprintf(stdout, gettext(
 356                             "nskernd: Attempting deferred bitmap error\n"));
 357                         return (0);
 358                 }
 359 
 360                 (void) mutex_lock(&cfg_mutex);
 361                 cfg = cfg_open("");
 362                 if (!cfg) {
 363                         (void) mutex_unlock(&cfg_mutex);
 364                         (void) fprintf(stderr, gettext(
 365                             "nskernd: Failed cfg_open, deferred bitmap\n"));
 366                         return (ENXIO);
 367                 }
 368 
 369                 /* Sooner or later, this lock will be free */
 370                 while (!cfg_lock(cfg, CFG_WRLOCK))
 371                         (void) sleep(2);
 372         }
 373 
 374         /* find the proper set number */
 375         for (setno = 1; !found; setno++) {
 376                 (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
 377                 if (cfg_get_cstring(cfg, key, buf, CFG_MAX_BUF) < 0) {
 378                         break;
 379                 }
 380 
 381                 mst = strtok(buf, " ");
 382                 shd = strtok(NULL, " ");
 383                 if (strncmp(shd, set, setlen) == 0) {
 384                         found = 1;
 385 
 386                         bmp = strtok(NULL, " ");
 387                         mode = strtok(NULL, " ");
 388                         ovr = strtok(NULL, " ");
 389                         cnode = strtok(NULL, " ");
 390                         opt = strtok(NULL, " ");
 391                         grp = strtok(NULL, " ");
 392                         break;
 393                 }
 394         }
 395 
 396         if (found) {
 397                 /* were there flags in the options field already? */
 398                 (void) snprintf(newflags, CFG_MAX_BUF, "%s=0x%x",
 399                     NSKERN_II_BMP_OPTION, flags);
 400                 if (opt && strcmp(opt, "-") != 0) {
 401                         bzero(newflags, CFG_MAX_BUF);
 402                         opt = strtok(opt, ";");
 403                         while (opt) {
 404                                 if (strncmp(opt, NSKERN_II_BMP_OPTION,
 405                                     strlen(NSKERN_II_BMP_OPTION)) != 0) {
 406                                         (void) strcat(newflags, ";");
 407                                         (void) strcat(newflags, opt);
 408                                 }
 409                         }
 410                 }
 411                 (void) snprintf(key, CFG_MAX_KEY, "ii.set%d", setno);
 412                 (void) snprintf(outbuf, CFG_MAX_BUF, "%s %s %s %s %s %s %s %s",
 413                     mst, shd, bmp, mode, ovr, cnode, newflags, grp);
 414                 if (cfg_put_cstring(cfg, key, outbuf, CFG_MAX_BUF) < 0) {
 415                         (void) printf("Failed to put [%s]\n", outbuf);
 416                         rc = ENXIO;
 417                 } else {
 418                         (void) cfg_commit(cfg);
 419                         rc = 0;
 420                 }
 421         } else {
 422                 (void) fprintf(stderr, gettext(
 423                     "nskernd: Failed deferred bitmap [%s]\n"), set);
 424                 rc = EINVAL;
 425         }
 426         cfg_unlock(cfg);
 427         cfg_close(cfg);
 428         (void) mutex_unlock(&cfg_mutex);
 429 
 430         /*
 431          * if we are the fork'ed client, just exit, if parent just return
 432          */
 433         if (pid == 0) {
 434                 exit(rc);
 435                 /*NOTREACHED*/
 436         } else {
 437                 return (rc);
 438         }
 439 }
 440 
 441 /*
 442  * First function run by a NSKERND_LOCK thread.
 443  *
 444  * Opens dscfg and locks it,
 445  * and then calls back into the kernel.
 446  *
 447  * Incoming:
 448  *      data1 is the kernel address of the sync structure.
 449  *      data2 is read(0)/write(1) lock mode.
 450  *
 451  * Returns:
 452  *      data1 as incoming.
 453  *      data2 errno.
 454  */
 455 static void *
 456 _dolock(void *arg)
 457 {
 458         struct nskernd nsk;
 459         CFGFILE *cfg;
 460         int locked;
 461         int mode;
 462         int rc = 0;
 463 
 464         /* copy arguments onto stack and free heap memory */
 465         bcopy(arg, &nsk, sizeof (nsk));
 466         free(arg);
 467 
 468         (void) mutex_lock(&cfg_mutex);
 469         cfg = cfg_open("");
 470         if (cfg == NULL) {
 471 #ifdef DEBUG
 472                 (void) fprintf(stderr,
 473                     gettext("nskernd: cfg_open failed: %s\n"),
 474                     strerror(errno));
 475 #endif
 476                 rc = ENXIO;
 477         }
 478 
 479         if (nsk.data2 == 0) {
 480                 mode = CFG_RDLOCK;
 481         } else {
 482                 mode = CFG_WRLOCK;
 483         }
 484 
 485         locked = 0;
 486         if (rc == 0) {
 487                 if (cfg_lock(cfg, mode)) {
 488                         locked = 1;
 489                 } else {
 490 #ifdef DEBUG
 491                         (void) fprintf(stderr,
 492                             gettext("nskernd: cfg_lock failed: %s\n"),
 493                             strerror(errno));
 494 #endif
 495                         rc = EINVAL;
 496                 }
 497         }
 498 
 499         /* return to kernel */
 500 
 501         nsk.data2 = (uint64_t)rc;
 502         if (nthread_inc()) {
 503                 (void) ioctl(nsctl_fd, NSCIOC_NSKERND, &nsk);
 504                 nthread_dec();
 505         }
 506 
 507         /* cleanup */
 508 
 509         if (locked) {
 510                 cfg_unlock(cfg);
 511                 locked = 0;
 512         }
 513 
 514         if (cfg != NULL) {
 515                 cfg_close(cfg);
 516                 cfg = NULL;
 517         }
 518         (void) mutex_unlock(&cfg_mutex);
 519 
 520         return (NULL);
 521 }
 522 
 523 
 524 /*
 525  * Inter-node lock thread.
 526  *
 527  * This is the user level side of nsc_rmlock().
 528  */
 529 static void
 530 dolock(struct nskernd *req)
 531 {
 532         struct nskernd *nskp;
 533         thread_t tid;
 534         int rc;
 535 
 536         /* create a new thread to do the lock and return to kernel */
 537 
 538         nskp = malloc(sizeof (*nskp));
 539         if (!nskp) {
 540 #ifdef DEBUG
 541                 (void) fprintf(stderr,
 542                     gettext("nskernd:dolock: malloc(%d) failed\n"),
 543                     sizeof (*nskp));
 544 #endif
 545                 req->data1 = (uint64_t)ENOMEM;
 546                 return;
 547         }
 548 
 549         /* copy args for child */
 550         bcopy(req, nskp, sizeof (*nskp));
 551 
 552         rc = thr_create(NULL, (THR_MIN_STACK + NSK_STACK_SIZE),
 553             _dolock, nskp, THR_BOUND|THR_DETACHED, &tid);
 554 
 555         if (rc != 0) {
 556                 /* thr_create failed */
 557 #ifdef DEBUG
 558                 (void) fprintf(stderr,
 559                     gettext("nskernd: thr_create failed: %s\n"),
 560                     strerror(errno));
 561 #endif
 562                 req->data1 = (uint64_t)errno;
 563                 free(nskp);
 564         } else {
 565                 /* success - _dolock() will free nskp */
 566                 req->data1 = (uint64_t)0;
 567         }
 568 }
 569 
 570 
 571 /*
 572  * Convenience code for engineering test of multi-terabyte volumes.
 573  *
 574  * zvol (part of zfs) does not support DKIOCPARTITION but does use EFI
 575  * labels.  This code allocates a simple efi label structure and ioctls
 576  * to extract the size of a zvol.  It only handles the minimal EFI ioctl
 577  * implementation in zvol.
 578  */
 579 
 580 static void
 581 zvol_bsize(char *path, uint64_t *size, const int pnum)
 582 {
 583         struct stat64 stb1, stb2;
 584         struct dk_minfo dkm;
 585         int fd = -1;
 586         int rc;
 587 
 588         if (cl_nodeid || pnum != 0)
 589                 return;
 590 
 591         if ((fd = open(path, O_RDONLY)) < 0) {
 592                 return;
 593         }
 594 
 595         if (stat64("/devices/pseudo/zfs@0:zfs", &stb1) != 0 ||
 596             fstat64(fd, &stb2) != 0 ||
 597             !S_ISCHR(stb1.st_mode) ||
 598             !S_ISCHR(stb2.st_mode) ||
 599             major(stb1.st_rdev) != major(stb2.st_rdev)) {
 600                 (void) close(fd);
 601                 return;
 602         }
 603 
 604         rc = ioctl(fd, DKIOCGMEDIAINFO, (void *)&dkm);
 605         if (rc >= 0) {
 606                 *size = LE_64(dkm.dki_capacity) *
 607                         (dkm.dki_lbsize) / 512;
 608         }
 609 
 610         (void) close(fd);
 611 }
 612 
 613 /* ARGSUSED */
 614 static void
 615 get_bsize(uint64_t raw_fd, uint64_t *size, int *partitionp, char *path)
 616 {
 617         struct nscioc_bsize bsize;
 618 #ifdef DKIOCPARTITION
 619         struct partition64 p64;
 620 #endif
 621         struct dk_cinfo dki_info;
 622         struct vtoc vtoc;
 623         int fd;
 624 
 625         *partitionp = -1;
 626         *size = (uint64_t)0;
 627 
 628         dki_info.dki_partition = (ushort_t)-1;
 629         bsize.dki_info = (uint64_t)(unsigned long)&dki_info;
 630         bsize.vtoc = (uint64_t)(unsigned long)&vtoc;
 631         bsize.raw_fd = raw_fd;
 632         bsize.efi = 0;
 633 
 634         fd = open(rdev, O_RDONLY);
 635         if (fd < 0)
 636                 return;
 637 
 638         if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
 639                 if (dki_info.dki_partition != (ushort_t)-1) {
 640                         /* assume part# is ok and just the size failed */
 641                         *partitionp = (int)dki_info.dki_partition;
 642 
 643 #ifdef DKIOCPARTITION
 644                         /* see if this is an EFI label */
 645                         bzero(&p64, sizeof (p64));
 646                         p64.p_partno = (uint_t)*partitionp;
 647                         if ((ioctl(fd, DKIOCPARTITION, &p64)) > 0) {
 648                                 *size = (uint64_t)p64.p_size;
 649                         } else {
 650                                 bsize.p64 = (uint64_t)(unsigned long)&p64;
 651                                 bsize.efi = 1;
 652 
 653                                 if (ioctl(fd, NSCIOC_BSIZE, &bsize) < 0) {
 654                                         /* see if this is a zvol */
 655                                         zvol_bsize(path, size, *partitionp);
 656                                 } else {
 657                                         *size = (uint64_t)p64.p_size;
 658                                 }
 659                         }
 660 #endif  /* DKIOCPARTITION */
 661                 }
 662 
 663                 (void) close(fd);
 664                 return;
 665         }
 666 
 667         (void) close(fd);
 668 
 669         *partitionp = (int)dki_info.dki_partition;
 670 
 671         if (vtoc.v_sanity != VTOC_SANE)
 672                 return;
 673 
 674         if (vtoc.v_version != V_VERSION && vtoc.v_version != 0)
 675                 return;
 676 
 677         if (dki_info.dki_partition > V_NUMPAR)
 678                 return;
 679 
 680         *size = (uint64_t)vtoc.v_part[(int)dki_info.dki_partition].p_size;
 681 }
 682 
 683 
 684 static int
 685 iscluster(void)
 686 {
 687         /*
 688          * Find out if we are running in a cluster
 689          */
 690         cl_nodeid = cfg_iscluster();
 691         if (cl_nodeid > 0) {
 692                 return (TRUE);
 693         } else if (cl_nodeid == 0) {
 694                 return (FALSE);
 695         }
 696 
 697         (void) fprintf(stderr, "%s\n",
 698             gettext("nskernd: unable to ascertain environment"));
 699         exit(1);
 700         /* NOTREACHED */
 701 }
 702 
 703 /*
 704  * Runtime Solaris release checking - build release == runtime release
 705  * is always considered success, so only keep entries in the map for
 706  * the special cases.
      *
      * The table is handed to nsc_check_release() by main() and ends with a
      * { NULL, NULL } entry.
      *
      * NOTE(review): each entry looks like a pair of release strings that
      * are to be treated as compatible; which member is the build release
      * and which the runtime release should be confirmed against the
      * nsc_release_t definition.
 707  */
 708 static nsc_release_t nskernd_rel_map[] = {
 709 /*      { "5.10", "5.10" },                     */
 710         { "5.11", "5.10" },
 711         { NULL, NULL }
 712 };
 713 
 714 
 715 #ifdef lint
 716 #define main    nskernd_main
 717 #endif
 718 /* ARGSUSED1 */
 719 int
 720 main(int argc, char *argv[])
 721 {
 722         const char *dir = "/";
 723         struct nskernd data;
 724         struct rlimit rl;
 725         int i, run, rc;
 726         int partition;
 727         char *reqd;
 728         int syncpipe[2];
 729         int startup;
 730 
 731         (void) setlocale(LC_ALL, "");
 732         (void) textdomain("nskernd");
 733 
 734         rc = nsc_check_release(BUILD_REV_STR, nskernd_rel_map, &reqd);
 735         if (rc < 0) {
 736                 (void) fprintf(stderr,
 737                     gettext("nskernd: unable to determine the current "
 738                     "Solaris release: %s\n"), strerror(errno));
 739                 exit(1);
 740         } else if (rc == FALSE) {
 741                 (void) fprintf(stderr,
 742                     gettext("nskernd: incorrect Solaris release "
 743                     "(requires %s)\n"), reqd);
 744                 exit(1);
 745         }
 746 
 747         rc = 0;
 748 
 749         if (argc != 1)
 750                 usage();
 751 
 752         /*
 753          * Usage: <progname> [-g] [-d <seconds to delay>]
 754          */
 755         while ((i = getopt(argc, argv, "gd:")) != EOF) {
 756                 switch (i) {
 757                         case 'g':
 758                                 display_msg = 1;
 759                                 break;
 760                         case 'd':
 761                                 delay_time = atoi(optarg);
 762                                 if (delay_time <= 0) {
 763                                         delay_time = 30;
 764                                 }
 765                                 break;
 766                         default:
 767                                 syslog(LOG_ERR,
 768                                 "Usage: nskernd [-g] [-d <seconds to delay>]");
 769                                 exit(1);
 770                                 break;
 771                 }
 772         }
 773 
 774         if (chroot(dir) < 0) {
 775                 (void) fprintf(stderr, gettext("nskernd: chroot failed: %s\n"),
 776                     strerror(errno));
 777                 exit(1);
 778         }
 779 
 780         if (chdir(dir) < 0) {
 781                 (void) fprintf(stderr, gettext("nskernd: chdir failed: %s\n"),
 782                     strerror(errno));
 783                 exit(1);
 784         }
 785 
 786         /*
 787          * Determine if we are in a Sun Cluster or not, before fork'ing
 788          */
 789         (void) iscluster();
 790 
 791         /*
 792          * create a pipe to synchronise the parent with the
 793          * child just before it enters its service loop.
 794          */
 795         if (pipe(syncpipe) < 0) {
 796                 (void) fprintf(stderr,
 797                     gettext("nskernd: cannot create pipe: %s\n"),
 798                     strerror(errno));
 799                 exit(1);
 800         }
 801         /*
 802          * Fork off a child that becomes the daemon.
 803          */
 804 
 805         if ((rc = fork()) > 0) {
 806                 char c;
 807                 int n;
 808                 (void) close(syncpipe[1]);
 809                 /*
 810                  * wait for the close of the pipe.
 811                  * If we get a char back, indicates good
 812                  * status from child, so exit 0.
 813                  * If we get a zero length read, then the
 814                  * child has failed, so we do too.
 815                  */
 816                 n = read(syncpipe[0], &c, 1);
 817                 exit((n <= 0) ? 1 : 0);
 818         } else if (rc < 0) {
 819                 (void) fprintf(stderr, gettext("nskernd: cannot fork: %s\n"),
 820                     strerror(errno));
 821                 exit(1);
 822         }
 823 
 824         /*
 825          * In child - become daemon.
 826          */
 827 
 828         /* use closefrom(3C) from PSARC/2000/193 when possible */
 829         for (i = 0; i < syncpipe[1]; i++) {
 830                 (void) close(i);
 831         }
 832         closefrom(syncpipe[1] + 1);
 833 
 834         (void) open("/dev/console", O_WRONLY|O_APPEND);
 835         (void) dup(0);
 836         (void) dup(0);
 837         (void) close(0);
 838 
 839         (void) setpgrp();
 840 
 841         /*
 842          * Ignore all signals apart from SIGTERM.
 843          */
 844 
 845         for (i = 1; i < _sys_nsig; i++)
 846                 (void) sigset(i, SIG_IGN);
 847 
 848         (void) sigset(SIGTERM, sighand);
 849 
 850         /*
 851          * Increase the number of fd's that can be open.
 852          */
 853 
 854         rl.rlim_cur = RLIM_INFINITY;
 855         rl.rlim_max = RLIM_INFINITY;
 856         if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
 857                 (void) fprintf(stderr,
 858                     gettext("nskernd: could not increase RLIMIT_NOFILE: %s\n"),
 859                     strerror(errno));
 860                 (void) fprintf(stderr,
 861                     gettext("nskernd: the maximum number of nsctl open "
 862                     "devices may be reduced\n"));
 863         }
 864 
 865         /*
 866          * Open /dev/nsctl and startup.
 867          */
 868 
 869         nsctl_fd = open(rdev, O_RDONLY);
 870         if (nsctl_fd < 0) {
 871                 (void) fprintf(stderr, gettext("nskernd: unable to open %s\n"),
 872                     rdev);
 873                 exit(1);
 874         }
 875 
 876         bzero(&data, sizeof (data));
 877 
 878         data.command = NSKERND_START;
 879         data.data1 = (uint64_t)cl_nodeid;
 880         run = 1;
 881 
 882         startup = 1;
 883         while (run) {
 884                 rc = ioctl(nsctl_fd, NSCIOC_NSKERND, &data);
 885                 if (rc < 0) {
 886                         /* try and do kernel cleanup and exit */
 887                         if (shutdown()) {
 888                                 run = 0;
 889                         } else {
 890                                 sigterm = 0;
 891                         }
 892 
 893                         (void) fprintf(stderr,
 894                             gettext("nskernd: NSCIOC_NSKERND failed: %s\n"),
 895                             strerror(errno));
 896                         continue;
 897                 } else if (sigterm) {
 898                         /* SIGTERM received - terminate */
 899                         if (data.command != NSKERND_START &&
 900                             (data.command != NSKERND_STOP ||
 901                             data.data1 != (uint64_t)1)) {
 902                                 /* need to do kernel cleanup */
 903                                 if (shutdown()) {
 904                                         run = 0;
 905                                 } else {
 906                                         sigterm = 0;
 907                                         data.command = NSKERND_START;
 908                                         data.data1 = (uint64_t)cl_nodeid;
 909                                 }
 910                         } else {
 911                                 /* just quit */
 912                                 if (canshutdown()) {
 913                                         run = 0;
 914                                 } else {
 915                                         /* cannot shutdown - threads active */
 916                                         sigterm = 0;
 917                                         data.command = NSKERND_START;
 918                                         data.data1 = (uint64_t)cl_nodeid;
 919                                 }
 920                         }
 921                         continue;
 922                 }
 923                 if (startup) {
 924                         char c = 0;
 925                         (void) write(syncpipe[1], &c, 1);
 926                         (void) close(syncpipe[1]);
 927                         startup = 0;
 928                 }
 929                 switch (data.command) {
 930                 case NSKERND_START:     /* (re)start completion */
 931                         if (rc == 1) {
 932                                 (void) fprintf(stderr,
 933                                     gettext("nskernd: already started\n"));
 934                                 run = 0;
 935                         } else if (rc == 2) {
 936                                 (void) fprintf(stderr,
 937                                     gettext("nskernd: stopped by kernel\n"));
 938                                 run = 0;
 939                         }
 940                         data.command = NSKERND_WAIT;
 941                         break;
 942 
 943                 case NSKERND_STOP:      /* kernel telling daemon to stop */
 944                         if (data.data1 != (uint64_t)1) {
 945                                 (void) shutdown();
 946                                 run = 0;
 947                         }
 948                         break;
 949 
 950                 case NSKERND_BSIZE:
 951                         /*
 952                          * kernel requesting partsize
 953                          * data1 - size return
 954                          * data2 - raw_fd (entry)
 955                          *       - partition number (return)
 956                          */
 957                         partition = -1;
 958                         get_bsize(data.data2, &data.data1,
 959                             &partition, data.char1);
 960                         data.data2 = (uint64_t)partition;
 961                         data.command = NSKERND_WAIT;
 962                         break;
 963 
 964                 case NSKERND_NEWLWP:    /* kernel requesting a new LWP */
 965                         newlwp(&data);
 966                         data.command = NSKERND_WAIT;
 967                         break;
 968 
 969                 case NSKERND_LOCK:      /* kernel requesting lock */
 970                         dolock(&data);
 971                         data.command = NSKERND_WAIT;
 972                         break;
 973 
 974                 case NSKERND_WAIT:      /* kernel retrying wait */
 975                         /*
 976                          * the kernel thread can be woken by the dr config
 977                          * utilities (ie cfgadm) therefore we just reissue
 978                          * the wait.
 979                          */
 980                         break;
 981 
 982                 case NSKERND_IIBITMAP:
 983                         rc = log_iibmp_err(data.char1, (int)data.data1);
 984                         data.data1 = (uint64_t)rc;
 985                         data.command = NSKERND_WAIT;
 986                         break;
 987 
 988                 default:
 989                         (void) fprintf(stderr,
 990                                 gettext("nskernd: unknown command %d"),
 991                                 data.command);
 992                         data.command = NSKERND_WAIT;
 993                         break;
 994                 }
 995         }
 996 
 997         (void) close(nsctl_fd);
 998 
 999         return (rc);
1000 }