1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2011 Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/param.h>
  29 #include <sys/types.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/systm.h>
  32 #include <sys/errno.h>
  33 #include <sys/vfs.h>
  34 #include <sys/vnode.h>
  35 #include <sys/swap.h>
  36 #include <sys/file.h>
  37 #include <sys/proc.h>
  38 #include <sys/var.h>
  39 #include <sys/uadmin.h>
  40 #include <sys/signal.h>
  41 #include <sys/time.h>
  42 #include <vm/seg_kmem.h>
  43 #include <sys/modctl.h>
  44 #include <sys/callb.h>
  45 #include <sys/dumphdr.h>
  46 #include <sys/debug.h>
  47 #include <sys/ftrace.h>
  48 #include <sys/cmn_err.h>
  49 #include <sys/panic.h>
  50 #include <sys/ddi.h>
  51 #include <sys/sunddi.h>
  52 #include <sys/policy.h>
  53 #include <sys/zone.h>
  54 #include <sys/condvar.h>
  55 #include <sys/thread.h>
  56 #include <sys/sdt.h>
  57 
  58 /*
  59  * Administrivia system call.  We provide this in two flavors: one for calling
  60  * from the system call path (uadmin), and the other for calling from elsewhere
  61  * within the kernel (kadmin).  Callers must beware that certain uadmin cmd
  62  * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin.
  63  */
  64 
  65 extern ksema_t fsflush_sema;
  66 kmutex_t ualock;
  67 kcondvar_t uacond;
  68 kthread_t *ua_shutdown_thread = NULL;
  69 
  70 int sys_shutdown = 0;
  71 volatile int fastreboot_dryrun = 0;
  72 
  73 /*
  74  * Kill all user processes in said zone.  A special argument of ALL_ZONES is
  75  * passed in when the system as a whole is shutting down.  The lack of per-zone
  76  * process lists is likely to make the following a performance bottleneck on a
  77  * system with many zones.
  78  */
  79 void
  80 killall(zoneid_t zoneid, boolean_t force)
  81 {
  82         proc_t *p;
  83 
  84         ASSERT(zoneid != GLOBAL_ZONEID);
  85         /*
  86          * Kill all processes except kernel daemons and ourself.
  87          * Make a first pass to stop all processes so they won't
  88          * be trying to restart children as we kill them.
  89          */
  90         mutex_enter(&pidlock);
  91         for (p = practive; p != NULL; p = p->p_next) {
  92                 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
  93                     p->p_exec != NULLVP &&   /* kernel daemons */
  94                     p->p_as != &kas &&
  95                     p->p_stat != SZOMB) {
  96                         mutex_enter(&p->p_lock);
  97                         p->p_flag |= SNOWAIT;
  98                         sigtoproc(p, NULL, SIGSTOP);
  99                         mutex_exit(&p->p_lock);
 100                 }
 101         }
 102         p = practive;
 103         while (p != NULL) {
 104                 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
 105                     p->p_exec != NULLVP &&   /* kernel daemons */
 106                     p->p_as != &kas &&
 107                     p->p_stat != SIDL &&
 108                     p->p_stat != SZOMB) {
 109                         mutex_enter(&p->p_lock);
 110                         if (!force && sigismember(&p->p_sig, SIGKILL)) {
 111                                 mutex_exit(&p->p_lock);
 112                                 p = p->p_next;
 113                         } else {
 114                                 sigtoproc(p, NULL, SIGKILL);
 115                                 mutex_exit(&p->p_lock);
 116                                 (void) cv_reltimedwait(&p->p_srwchan_cv,
 117                                     &pidlock, hz, TR_CLOCK_TICK);
 118                                 p = practive;
 119                         }
 120                 } else {
 121                         p = p->p_next;
 122                 }
 123         }
 124         mutex_exit(&pidlock);
 125 }
 126 
 127 int
 128 kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
 129 {
 130         int error = 0;
 131         char *buf;
 132         size_t buflen = 0;
 133         boolean_t invoke_cb = B_FALSE;
 134 
 135         /*
 136          * We might be called directly by the kernel's fault-handling code, so
 137          * we can't assert that the caller is in the global zone.
 138          */
 139 
 140         /*
 141          * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
 142          * and that we have appropriate privileges for this action.
 143          */
 144         switch (cmd) {
 145         case A_FTRACE:
 146         case A_SHUTDOWN:
 147         case A_REBOOT:
 148         case A_REMOUNT:
 149         case A_FREEZE:
 150         case A_DUMP:
 151         case A_SDTTEST:
 152         case A_CONFIG:
 153                 if (secpolicy_sys_config(credp, B_FALSE) != 0)
 154                         return (EPERM);
 155                 break;
 156 
 157         default:
 158                 return (EINVAL);
 159         }
 160 
 161         /*
 162          * Serialize these operations on ualock.  If it is held, the
 163          * system should shutdown, reboot, or remount shortly, unless there is
 164          * an error.  We need a cv rather than just a mutex because proper
 165          * functioning of A_REBOOT relies on being able to interrupt blocked
 166          * userland callers.
 167          *
 168          * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG.
 169          * Other commands should never return.
 170          */
 171         if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT ||
 172             cmd == A_CONFIG) {
 173                 mutex_enter(&ualock);
 174                 while (ua_shutdown_thread != NULL) {
 175                         if (cv_wait_sig(&uacond, &ualock) == 0) {
 176                                 /*
 177                                  * If we were interrupted, leave, and handle
 178                                  * the signal (or exit, depending on what
 179                                  * happened)
 180                                  */
 181                                 mutex_exit(&ualock);
 182                                 return (EINTR);
 183                         }
 184                 }
 185                 ua_shutdown_thread = curthread;
 186                 mutex_exit(&ualock);
 187         }
 188 
 189         switch (cmd) {
 190         case A_SHUTDOWN:
 191         {
 192                 proc_t *p = ttoproc(curthread);
 193 
 194                 /*
 195                  * Release (almost) all of our own resources if we are called
 196                  * from a user context, however if we are calling kadmin() from
 197                  * a kernel context then we do not release these resources.
 198                  */
 199                 if (p != &p0) {
 200                         proc_is_exiting(p);
 201                         if ((error = exitlwps(0)) != 0) {
 202                                 /*
 203                                  * Another thread in this process also called
 204                                  * exitlwps().
 205                                  */
 206                                 mutex_enter(&ualock);
 207                                 ua_shutdown_thread = NULL;
 208                                 cv_signal(&uacond);
 209                                 mutex_exit(&ualock);
 210                                 return (error);
 211                         }
 212                         mutex_enter(&p->p_lock);
 213                         p->p_flag |= SNOWAIT;
 214                         sigfillset(&p->p_ignore);
 215                         curthread->t_lwp->lwp_cursig = 0;
 216                         curthread->t_lwp->lwp_extsig = 0;
 217                         if (p->p_exec) {
 218                                 vnode_t *exec_vp = p->p_exec;
 219                                 p->p_exec = NULLVP;
 220                                 mutex_exit(&p->p_lock);
 221                                 VN_RELE(exec_vp);
 222                         } else {
 223                                 mutex_exit(&p->p_lock);
 224                         }
 225 
 226                         pollcleanup();
 227                         closeall(P_FINFO(curproc));
 228                         relvm();
 229 
 230                 } else {
 231                         /*
 232                          * Reset t_cred if not set because much of the
 233                          * filesystem code depends on CRED() being valid.
 234                          */
 235                         if (curthread->t_cred == NULL)
 236                                 curthread->t_cred = kcred;
 237                 }
 238 
 239                 /* indicate shutdown in progress */
 240                 sys_shutdown = 1;
 241 
 242                 /*
 243                  * Communcate that init shouldn't be restarted.
 244                  */
 245                 zone_shutdown_global();
 246 
 247                 killall(ALL_ZONES, B_FALSE);
 248                 /*
 249                  * If we are calling kadmin() from a kernel context then we
 250                  * do not release these resources.
 251                  */
 252                 if (ttoproc(curthread) != &p0) {
 253                         VN_RELE(PTOU(curproc)->u_cdir);
 254                         if (PTOU(curproc)->u_rdir)
 255                                 VN_RELE(PTOU(curproc)->u_rdir);
 256                         if (PTOU(curproc)->u_cwd)
 257                                 refstr_rele(PTOU(curproc)->u_cwd);
 258 
 259                         PTOU(curproc)->u_cdir = rootdir;
 260                         PTOU(curproc)->u_rdir = NULL;
 261                         PTOU(curproc)->u_cwd = NULL;
 262                 }
 263 
 264                 /*
 265                  * Allow the reboot/halt/poweroff code a chance to do
 266                  * anything it needs to whilst we still have filesystems
 267                  * mounted, like loading any modules necessary for later
 268                  * performing the actual poweroff.
 269                  */
 270                 if ((mdep != NULL) && (*(char *)mdep == '/')) {
 271                         buf = i_convert_boot_device_name(mdep, NULL, &buflen);
 272                         mdpreboot(cmd, fcn, buf);
 273                 } else
 274                         mdpreboot(cmd, fcn, mdep);
 275 
 276                 /*
 277                  * Allow fsflush to finish running and then prevent it
 278                  * from ever running again so that vfs_unmountall() and
 279                  * vfs_syncall() can acquire the vfs locks they need.
 280                  */
 281                 sema_p(&fsflush_sema);
 282                 (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL);
 283 
 284                 vfs_unmountall();
 285                 (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
 286                 vfs_syncall();
 287 
 288                 dump_ereports();
 289                 dump_messages();
 290 
 291                 invoke_cb = B_TRUE;
 292 
 293                 /* FALLTHROUGH */
 294         }
 295 
 296         case A_REBOOT:
 297                 if ((mdep != NULL) && (*(char *)mdep == '/')) {
 298                         buf = i_convert_boot_device_name(mdep, NULL, &buflen);
 299                         mdboot(cmd, fcn, buf, invoke_cb);
 300                 } else
 301                         mdboot(cmd, fcn, mdep, invoke_cb);
 302                 /* no return expected */
 303                 break;
 304 
 305         case A_CONFIG:
 306                 switch (fcn) {
 307                 case AD_UPDATE_BOOT_CONFIG:
 308 #ifndef __sparc
 309                 {
 310                         extern void fastboot_update_config(const char *);
 311 
 312                         fastboot_update_config(mdep);
 313                 }
 314 #endif
 315 
 316                         break;
 317                 }
 318                 /* Let other threads enter the shutdown path now */
 319                 mutex_enter(&ualock);
 320                 ua_shutdown_thread = NULL;
 321                 cv_signal(&uacond);
 322                 mutex_exit(&ualock);
 323                 break;
 324 
 325         case A_REMOUNT:
 326                 (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
 327                 /* Let other threads enter the shutdown path now */
 328                 mutex_enter(&ualock);
 329                 ua_shutdown_thread = NULL;
 330                 cv_signal(&uacond);
 331                 mutex_exit(&ualock);
 332                 break;
 333 
 334         case A_FREEZE:
 335         {
 336                 /*
 337                  * This is the entrypoint for all suspend/resume actions.
 338                  */
 339                 extern int cpr(int, void *);
 340 
 341                 if (modload("misc", "cpr") == -1)
 342                         return (ENOTSUP);
 343                 /* Let the CPR module decide what to do with mdep */
 344                 error = cpr(fcn, mdep);
 345                 break;
 346         }
 347 
 348         case A_FTRACE:
 349         {
 350                 switch (fcn) {
 351                 case AD_FTRACE_START:
 352                         (void) FTRACE_START();
 353                         break;
 354                 case AD_FTRACE_STOP:
 355                         (void) FTRACE_STOP();
 356                         break;
 357                 default:
 358                         error = EINVAL;
 359                 }
 360                 break;
 361         }
 362 
 363         case A_DUMP:
 364         {
 365                 if (fcn == AD_NOSYNC) {
 366                         in_sync = 1;
 367                         break;
 368                 }
 369 
 370                 panic_bootfcn = fcn;
 371                 panic_forced = 1;
 372 
 373                 if ((mdep != NULL) && (*(char *)mdep == '/')) {
 374                         panic_bootstr = i_convert_boot_device_name(mdep,
 375                             NULL, &buflen);
 376                 } else
 377                         panic_bootstr = mdep;
 378 
 379 #ifndef __sparc
 380                 extern void fastboot_update_and_load(int, char *);
 381 
 382                 fastboot_update_and_load(fcn, mdep);
 383 #endif
 384 
 385                 panic("forced crash dump initiated at user request");
 386                 /*NOTREACHED*/
 387         }
 388 
 389         case A_SDTTEST:
 390         {
 391                 DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5,
 392                     int, 6, int, 7);
 393                 break;
 394         }
 395 
 396         default:
 397                 error = EINVAL;
 398         }
 399 
 400         return (error);
 401 }
 402 
 403 int
 404 uadmin(int cmd, int fcn, uintptr_t mdep)
 405 {
 406         int error = 0, rv = 0;
 407         size_t nbytes = 0;
 408         cred_t *credp = CRED();
 409         char *bootargs = NULL;
 410         int reset_status = 0;
 411 
 412         if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) {
 413                 ddi_walk_devs(ddi_root_node(), check_driver_quiesce,
 414                     &reset_status);
 415                 if (reset_status != 0)
 416                         return (EIO);
 417                 else
 418                         return (0);
 419         }
 420 
 421         /*
 422          * The swapctl system call doesn't have its own entry point: it uses
 423          * uadmin as a wrapper so we just call it directly from here.
 424          */
 425         if (cmd == A_SWAPCTL) {
 426                 if (get_udatamodel() == DATAMODEL_NATIVE)
 427                         error = swapctl(fcn, (void *)mdep, &rv);
 428 #if defined(_SYSCALL32_IMPL)
 429                 else
 430                         error = swapctl32(fcn, (void *)mdep, &rv);
 431 #endif /* _SYSCALL32_IMPL */
 432                 return (error ? set_errno(error) : rv);
 433         }
 434 
 435         /*
 436          * Certain subcommands intepret a non-NULL mdep value as a pointer to
 437          * a boot string.  We pull that in as bootargs, if applicable.
 438          */
 439         if (mdep != NULL &&
 440             (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP ||
 441             cmd == A_FREEZE || cmd == A_CONFIG)) {
 442                 bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
 443                 if ((error = copyinstr((const char *)mdep, bootargs,
 444                     BOOTARGS_MAX, &nbytes)) != 0) {
 445                         kmem_free(bootargs, BOOTARGS_MAX);
 446                         return (set_errno(error));
 447                 }
 448         }
 449 
 450         /*
 451          * Invoke the appropriate kadmin() routine.
 452          */
 453         if (getzoneid() != GLOBAL_ZONEID)
 454                 error = zone_kadmin(cmd, fcn, bootargs, credp);
 455         else
 456                 error = kadmin(cmd, fcn, bootargs, credp);
 457 
 458         if (bootargs != NULL)
 459                 kmem_free(bootargs, BOOTARGS_MAX);
 460         return (error ? set_errno(error) : 0);
 461 }