1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2013 Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/param.h>
  29 #include <sys/types.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/systm.h>
  32 #include <sys/errno.h>
  33 #include <sys/vfs.h>
  34 #include <sys/vnode.h>
  35 #include <sys/swap.h>
  36 #include <sys/file.h>
  37 #include <sys/proc.h>
  38 #include <sys/var.h>
  39 #include <sys/uadmin.h>
  40 #include <sys/signal.h>
  41 #include <sys/time.h>
  42 #include <vm/seg_kmem.h>
  43 #include <sys/modctl.h>
  44 #include <sys/callb.h>
  45 #include <sys/dumphdr.h>
  46 #include <sys/debug.h>
  47 #include <sys/ftrace.h>
  48 #include <sys/cmn_err.h>
  49 #include <sys/panic.h>
  50 #include <sys/ddi.h>
  51 #include <sys/ddi_periodic.h>
  52 #include <sys/sunddi.h>
  53 #include <sys/policy.h>
  54 #include <sys/zone.h>
  55 #include <sys/condvar.h>
  56 #include <sys/thread.h>
  57 #include <sys/sdt.h>
  58 
  59 /*
  60  * Administrivia system call.  We provide this in two flavors: one for calling
  61  * from the system call path (uadmin), and the other for calling from elsewhere
  62  * within the kernel (kadmin).  Callers must beware that certain uadmin cmd
  63  * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin.
  64  */
  65 
/* Taken with sema_p() on the A_SHUTDOWN path before filesystems are unmounted. */
extern ksema_t fsflush_sema;

/*
 * Serialization state for kadmin().  ua_shutdown_thread is non-NULL while a
 * thread is inside an A_SHUTDOWN, A_REBOOT, A_REMOUNT or A_CONFIG operation;
 * it is read and written under ualock, and other callers wait on uacond
 * until it is cleared.
 */
kmutex_t ualock;
kcondvar_t uacond;
kthread_t *ua_shutdown_thread = NULL;

/* Set to 1 by kadmin() once an A_SHUTDOWN is under way. */
int sys_shutdown = 0;
/* Not referenced elsewhere in this part of the file. */
volatile int fastreboot_dryrun = 0;
  73 
  74 /*
  75  * Kill all user processes in said zone.  A special argument of ALL_ZONES is
  76  * passed in when the system as a whole is shutting down.  The lack of per-zone
  77  * process lists is likely to make the following a performance bottleneck on a
  78  * system with many zones.
  79  */
  80 void
  81 killall(zoneid_t zoneid, boolean_t force)
  82 {
  83         proc_t *p;
  84 
  85         ASSERT(zoneid != GLOBAL_ZONEID);
  86         /*
  87          * Kill all processes except kernel daemons and ourself.
  88          * Make a first pass to stop all processes so they won't
  89          * be trying to restart children as we kill them.
  90          */
  91         mutex_enter(&pidlock);
  92         for (p = practive; p != NULL; p = p->p_next) {
  93                 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
  94                     p->p_exec != NULLVP &&   /* kernel daemons */
  95                     p->p_as != &kas &&
  96                     p->p_stat != SZOMB) {
  97                         mutex_enter(&p->p_lock);
  98                         p->p_flag |= SNOWAIT;
  99                         sigtoproc(p, NULL, SIGSTOP);
 100                         mutex_exit(&p->p_lock);
 101                 }
 102         }
 103         p = practive;
 104         while (p != NULL) {
 105                 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
 106                     p->p_exec != NULLVP &&   /* kernel daemons */
 107                     p->p_as != &kas &&
 108                     p->p_stat != SIDL &&
 109                     p->p_stat != SZOMB) {
 110                         mutex_enter(&p->p_lock);
 111                         if (!force && sigismember(&p->p_sig, SIGKILL)) {
 112                                 mutex_exit(&p->p_lock);
 113                                 p = p->p_next;
 114                         } else {
 115                                 sigtoproc(p, NULL, SIGKILL);
 116                                 mutex_exit(&p->p_lock);
 117                                 (void) cv_reltimedwait(&p->p_srwchan_cv,
 118                                     &pidlock, hz, TR_CLOCK_TICK);
 119                                 p = practive;
 120                         }
 121                 } else {
 122                         p = p->p_next;
 123                 }
 124         }
 125         mutex_exit(&pidlock);
 126 }
 127 
/*
 * In-kernel administrative entry point.
 *
 * cmd must be one of A_FTRACE, A_SHUTDOWN, A_REBOOT, A_REMOUNT, A_FREEZE,
 * A_DUMP, A_SDTTEST or A_CONFIG; anything else fails with EINVAL.  For the
 * accepted commands secpolicy_sys_config() is checked against credp and
 * EPERM is returned if it fails.  fcn selects the sub-function and mdep is
 * an optional, command-specific argument; when it is a string beginning
 * with '/', the shutdown, reboot and dump paths run it through
 * i_convert_boot_device_name() before use.
 *
 * Returns 0 or an errno value: EINVAL, EPERM, EINTR (signalled while waiting
 * for another shutdown-class operation), ENOTSUP (cpr module failed to
 * load), or whatever exitlwps() or cpr() report.  A_SHUTDOWN, A_REBOOT and
 * A_DUMP (other than AD_NOSYNC) are not expected to return at all.
 */
int
kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
{
        int error = 0;
        char *buf;
        size_t buflen = 0;
        boolean_t invoke_cb = B_FALSE;

        /*
         * We might be called directly by the kernel's fault-handling code, so
         * we can't assert that the caller is in the global zone.
         */

        /*
         * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
         * and that we have appropriate privileges for this action.
         */
        switch (cmd) {
        case A_FTRACE:
        case A_SHUTDOWN:
        case A_REBOOT:
        case A_REMOUNT:
        case A_FREEZE:
        case A_DUMP:
        case A_SDTTEST:
        case A_CONFIG:
                if (secpolicy_sys_config(credp, B_FALSE) != 0)
                        return (EPERM);
                break;

        default:
                return (EINVAL);
        }

        /*
         * Serialize these operations on ualock.  If it is held, the
         * system should shutdown, reboot, or remount shortly, unless there is
         * an error.  We need a cv rather than just a mutex because proper
         * functioning of A_REBOOT relies on being able to interrupt blocked
         * userland callers.
         *
         * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG.
         * Other commands should never return.
         */
        if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT ||
            cmd == A_CONFIG) {
                mutex_enter(&ualock);
                while (ua_shutdown_thread != NULL) {
                        if (cv_wait_sig(&uacond, &ualock) == 0) {
                                /*
                                 * If we were interrupted, leave, and handle
                                 * the signal (or exit, depending on what
                                 * happened)
                                 */
                                mutex_exit(&ualock);
                                return (EINTR);
                        }
                }
                ua_shutdown_thread = curthread;
                mutex_exit(&ualock);
        }

        switch (cmd) {
        case A_SHUTDOWN:
        {
                proc_t *p = ttoproc(curthread);

                /*
                 * Release (almost) all of our own resources if we are called
                 * from a user context, however if we are calling kadmin() from
                 * a kernel context then we do not release these resources.
                 */
                if (p != &p0) {
                        proc_is_exiting(p);
                        if ((error = exitlwps(0)) != 0) {
                                /*
                                 * Another thread in this process also called
                                 * exitlwps().
                                 */
                                mutex_enter(&ualock);
                                ua_shutdown_thread = NULL;
                                cv_signal(&uacond);
                                mutex_exit(&ualock);
                                return (error);
                        }
                        mutex_enter(&p->p_lock);
                        p->p_flag |= SNOWAIT;
                        sigfillset(&p->p_ignore);
                        curthread->t_lwp->lwp_cursig = 0;
                        curthread->t_lwp->lwp_extsig = 0;
                        /*
                         * Clearing p_exec makes this process fail the
                         * p_exec != NULLVP filter in killall(), so the
                         * killall() call below skips us.
                         */
                        if (p->p_exec) {
                                vnode_t *exec_vp = p->p_exec;
                                p->p_exec = NULLVP;
                                mutex_exit(&p->p_lock);
                                VN_RELE(exec_vp);
                        } else {
                                mutex_exit(&p->p_lock);
                        }

                        pollcleanup();
                        closeall(P_FINFO(curproc));
                        relvm();

                } else {
                        /*
                         * Reset t_cred if not set because much of the
                         * filesystem code depends on CRED() being valid.
                         */
                        if (curthread->t_cred == NULL)
                                curthread->t_cred = kcred;
                }

                /* indicate shutdown in progress */
                sys_shutdown = 1;

                /*
                 * Communicate that init shouldn't be restarted.
                 */
                zone_shutdown_global();

                killall(ALL_ZONES, B_FALSE);
                /*
                 * If we are calling kadmin() from a kernel context then we
                 * do not release these resources.
                 */
                if (ttoproc(curthread) != &p0) {
                        VN_RELE(PTOU(curproc)->u_cdir);
                        if (PTOU(curproc)->u_rdir)
                                VN_RELE(PTOU(curproc)->u_rdir);
                        if (PTOU(curproc)->u_cwd)
                                refstr_rele(PTOU(curproc)->u_cwd);

                        PTOU(curproc)->u_cdir = rootdir;
                        PTOU(curproc)->u_rdir = NULL;
                        PTOU(curproc)->u_cwd = NULL;
                }

                /*
                 * Allow the reboot/halt/poweroff code a chance to do
                 * anything it needs to whilst we still have filesystems
                 * mounted, like loading any modules necessary for later
                 * performing the actual poweroff.
                 */
                if ((mdep != NULL) && (*(char *)mdep == '/')) {
                        buf = i_convert_boot_device_name(mdep, NULL, &buflen);
                        mdpreboot(cmd, fcn, buf);
                } else
                        mdpreboot(cmd, fcn, mdep);

                /*
                 * Allow fsflush to finish running and then prevent it
                 * from ever running again so that vfs_unmountall() and
                 * vfs_syncall() can acquire the vfs locks they need.
                 */
                sema_p(&fsflush_sema);
                (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL);

                vfs_unmountall();
                (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
                vfs_syncall();

                /*
                 * Check for (and unregister) any DDI periodic handlers that
                 * still exist, as they most likely constitute resource leaks:
                 */
                ddi_periodic_fini();

                dump_ereports();
                dump_messages();

                /* Handed to mdboot() by the A_REBOOT code we fall into. */
                invoke_cb = B_TRUE;

                /* FALLTHROUGH */
        }

        case A_REBOOT:
                /*
                 * NOTE(review): when reached by falling through from
                 * A_SHUTDOWN with a '/'-prefixed mdep, the device name is
                 * converted a second time and the buffer obtained before
                 * mdpreboot() is neither reused nor freed here.  Presumably
                 * harmless since the system is going down -- confirm.
                 */
                if ((mdep != NULL) && (*(char *)mdep == '/')) {
                        buf = i_convert_boot_device_name(mdep, NULL, &buflen);
                        mdboot(cmd, fcn, buf, invoke_cb);
                } else
                        mdboot(cmd, fcn, mdep, invoke_cb);
                /* no return expected */
                break;

        case A_CONFIG:
                /*
                 * NOTE(review): any fcn other than AD_UPDATE_BOOT_CONFIG
                 * (and AD_UPDATE_BOOT_CONFIG itself when __sparc is
                 * defined) does nothing here and still returns 0.
                 */
                switch (fcn) {
                case AD_UPDATE_BOOT_CONFIG:
#ifndef __sparc
                {
                        extern void fastboot_update_config(const char *);

                        fastboot_update_config(mdep);
                }
#endif

                        break;
                }
                /* Let other threads enter the shutdown path now */
                mutex_enter(&ualock);
                ua_shutdown_thread = NULL;
                cv_signal(&uacond);
                mutex_exit(&ualock);
                break;

        case A_REMOUNT:
                (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
                /* Let other threads enter the shutdown path now */
                mutex_enter(&ualock);
                ua_shutdown_thread = NULL;
                cv_signal(&uacond);
                mutex_exit(&ualock);
                break;

        case A_FREEZE:
        {
                /*
                 * This is the entrypoint for all suspend/resume actions.
                 */
                extern int cpr(int, void *);

                /*
                 * A_FREEZE is not among the commands serialized on ualock
                 * above, so returning directly leaves nothing to release.
                 */
                if (modload("misc", "cpr") == -1)
                        return (ENOTSUP);
                /* Let the CPR module decide what to do with mdep */
                error = cpr(fcn, mdep);
                break;
        }

        case A_FTRACE:
        {
                switch (fcn) {
                case AD_FTRACE_START:
                        (void) FTRACE_START();
                        break;
                case AD_FTRACE_STOP:
                        (void) FTRACE_STOP();
                        break;
                default:
                        error = EINVAL;
                }
                break;
        }

        case A_DUMP:
        {
                /* AD_NOSYNC only sets in_sync and returns; no panic. */
                if (fcn == AD_NOSYNC) {
                        in_sync = 1;
                        break;
                }

                panic_bootfcn = fcn;
                panic_forced = 1;

                if ((mdep != NULL) && (*(char *)mdep == '/')) {
                        panic_bootstr = i_convert_boot_device_name(mdep,
                            NULL, &buflen);
                } else
                        panic_bootstr = mdep;

#ifndef __sparc
                extern void fastboot_update_and_load(int, char *);

                fastboot_update_and_load(fcn, mdep);
#endif

                panic("forced crash dump initiated at user request");
                /*NOTREACHED*/
        }

        case A_SDTTEST:
        {
                DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5,
                    int, 6, int, 7);
                break;
        }

        /*
         * Every cmd value admitted by the validation switch at the top has a
         * case above, so this is a backstop only.
         */
        default:
                error = EINVAL;
        }

        return (error);
}
 409 
 410 int
 411 uadmin(int cmd, int fcn, uintptr_t mdep)
 412 {
 413         int error = 0, rv = 0;
 414         size_t nbytes = 0;
 415         cred_t *credp = CRED();
 416         char *bootargs = NULL;
 417         int reset_status = 0;
 418 
 419         if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) {
 420                 ddi_walk_devs(ddi_root_node(), check_driver_quiesce,
 421                     &reset_status);
 422                 if (reset_status != 0)
 423                         return (EIO);
 424                 else
 425                         return (0);
 426         }
 427 
 428         /*
 429          * The swapctl system call doesn't have its own entry point: it uses
 430          * uadmin as a wrapper so we just call it directly from here.
 431          */
 432         if (cmd == A_SWAPCTL) {
 433                 if (get_udatamodel() == DATAMODEL_NATIVE)
 434                         error = swapctl(fcn, (void *)mdep, &rv);
 435 #if defined(_SYSCALL32_IMPL)
 436                 else
 437                         error = swapctl32(fcn, (void *)mdep, &rv);
 438 #endif /* _SYSCALL32_IMPL */
 439                 return (error ? set_errno(error) : rv);
 440         }
 441 
 442         /*
 443          * Certain subcommands intepret a non-NULL mdep value as a pointer to
 444          * a boot string.  We pull that in as bootargs, if applicable.
 445          */
 446         if (mdep != NULL &&
 447             (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP ||
 448             cmd == A_FREEZE || cmd == A_CONFIG)) {
 449                 bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
 450                 if ((error = copyinstr((const char *)mdep, bootargs,
 451                     BOOTARGS_MAX, &nbytes)) != 0) {
 452                         kmem_free(bootargs, BOOTARGS_MAX);
 453                         return (set_errno(error));
 454                 }
 455         }
 456 
 457         /*
 458          * Invoke the appropriate kadmin() routine.
 459          */
 460         if (getzoneid() != GLOBAL_ZONEID)
 461                 error = zone_kadmin(cmd, fcn, bootargs, credp);
 462         else
 463                 error = kadmin(cmd, fcn, bootargs, credp);
 464 
 465         if (bootargs != NULL)
 466                 kmem_free(bootargs, BOOTARGS_MAX);
 467         return (error ? set_errno(error) : 0);
 468 }