1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/errno.h>
  28 #include <sys/cpuvar.h>
  29 #include <sys/vfs.h>
  30 #include <sys/vnode.h>
  31 #include <sys/pathname.h>
  32 #include <sys/callb.h>
  33 #include <sys/fs/ufs_inode.h>
  34 #include <vm/anon.h>
  35 #include <sys/fs/swapnode.h>      /* for swapfs_minfree */
  36 #include <sys/kmem.h>
  37 #include <sys/cpr.h>
  38 #include <sys/conf.h>
  39 #include <sys/machclock.h>
  40 
/*
 * CPR miscellaneous support routines
 */

/*
 * cpr_open: vn_open() wrapper for a kernel-space pathname.  The file is
 * opened with the caller's mode flags, creation permissions 0600 and the
 * CRCREAT create-op; the resulting vnode is returned through vpp.
 */
#define cpr_open(path, mode,  vpp)      (vn_open(path, UIO_SYSSPACE, \
                mode, 0600, vpp, CRCREAT, 0))
/*
 * cpr_rdwr: vn_rdwr() wrapper that transfers cnt bytes between vp and the
 * kernel buffer basep, always at file offset 0.  No residual-count pointer
 * is supplied.
 * NOTE(review): presumably a short transfer is therefore reported as an
 * error by vn_rdwr() -- confirm.
 */
#define cpr_rdwr(rw, vp, basep, cnt)    (vn_rdwr(rw, vp,  (caddr_t)(basep), \
                cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
                (ssize_t *)NULL))

/* routines implemented outside this file */
extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

/* cpr state defined outside this file */
extern kmutex_t cpr_slock;      /* taken in cpr_init(), dropped in cpr_done() */
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
/* in-core copy of the config file; filled in by cpr_get_config() */
static struct cprconfig cprconfig;
/* set once a config with a valid magic number has been read */
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);
int cpr_is_zfs(struct vfs *);

/* pathname of the cpr default file, see cpr_open_deffile() */
char cpr_default_path[] = CPR_DEFAULT;

/* constants used by cpr_statefile_ok() when sizing the statefile */
#define COMPRESS_PERCENT 40     /* approx compression ratio in percent */
#define SIZE_RATE       115     /* increase size by 15% */
#define INTEGRAL        100     /* for integer math */
  80 
  81 /*
  82  * cmn_err() followed by a 1/4 second delay; this gives the
  83  * logging service a chance to flush messages and helps avoid
  84  * intermixing output from prom_printf().
  85  */
  86 /*PRINTFLIKE2*/
  87 void
  88 cpr_err(int ce, const char *fmt, ...)
  89 {
  90         va_list adx;
  91 
  92         va_start(adx, fmt);
  93         vcmn_err(ce, fmt, adx);
  94         va_end(adx);
  95         drv_usecwait(MICROSEC >> 2);
  96 }
  97 
  98 
  99 int
 100 cpr_init(int fcn)
 101 {
 102         /*
 103          * Allow only one suspend/resume process.
 104          */
 105         if (mutex_tryenter(&cpr_slock) == 0)
 106                 return (EBUSY);
 107 
 108         CPR->c_flags = 0;
 109         CPR->c_substate = 0;
 110         CPR->c_cprboot_magic = 0;
 111         CPR->c_alloc_cnt = 0;
 112 
 113         CPR->c_fcn = fcn;
 114         if (fcn == AD_CPR_REUSABLE)
 115                 CPR->c_flags |= C_REUSABLE;
 116         else
 117                 CPR->c_flags |= C_SUSPENDING;
 118         if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
 119                 return (0);
 120         }
 121 #if defined(__sparc)
 122         if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
 123                 CPR->c_flags |= C_COMPRESSING;
 124         /*
 125          * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
 126          */
 127         CPR->c_mapping_area = i_cpr_map_setup();
 128         if (CPR->c_mapping_area == 0) {              /* no space in kernelmap */
 129                 cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
 130                 mutex_exit(&cpr_slock);
 131                 return (EAGAIN);
 132         }
 133         if (cpr_debug & CPR_DEBUG3)
 134                 cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
 135                     "kas\n", (void *)CPR->c_mapping_area);
 136 #endif
 137 
 138         return (0);
 139 }
 140 
 141 /*
 142  * This routine releases any resources used during the checkpoint.
 143  */
 144 void
 145 cpr_done(void)
 146 {
 147         cpr_stat_cleanup();
 148         i_cpr_bitmap_cleanup();
 149 
 150         /*
 151          * Free pages used by cpr buffers.
 152          */
 153         if (cpr_buf) {
 154                 kmem_free(cpr_buf, cpr_buf_size);
 155                 cpr_buf = NULL;
 156         }
 157         if (cpr_pagedata) {
 158                 kmem_free(cpr_pagedata, cpr_pagedata_size);
 159                 cpr_pagedata = NULL;
 160         }
 161 
 162         i_cpr_free_memory_resources();
 163         mutex_exit(&cpr_slock);
 164         cpr_err(CE_CONT, "System has been resumed.\n");
 165 }
 166 
 167 
 168 #if defined(__sparc)
 169 /*
 170  * reads config data into cprconfig
 171  */
 172 static int
 173 cpr_get_config(void)
 174 {
 175         static char config_path[] = CPR_CONFIG;
 176         struct cprconfig *cf = &cprconfig;
 177         struct vnode *vp;
 178         char *fmt;
 179         int err;
 180 
 181         if (cprconfig_loaded)
 182                 return (0);
 183 
 184         fmt = "cannot %s config file \"%s\", error %d\n";
 185         if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
 186                 cpr_err(CE_CONT, fmt, "open", config_path, err);
 187                 return (err);
 188         }
 189 
 190         err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
 191         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 192         VN_RELE(vp);
 193         if (err) {
 194                 cpr_err(CE_CONT, fmt, "read", config_path, err);
 195                 return (err);
 196         }
 197 
 198         if (cf->cf_magic == CPR_CONFIG_MAGIC)
 199                 cprconfig_loaded = 1;
 200         else {
 201                 cpr_err(CE_CONT, "invalid config file \"%s\", "
 202                     "rerun pmconfig(1M)\n", config_path);
 203                 err = EINVAL;
 204         }
 205 
 206         return (err);
 207 }
 208 
 209 
 210 /*
 211  * concat fs and path fields of the cprconfig structure;
 212  * returns pointer to the base of static data
 213  */
 214 static char *
 215 cpr_cprconfig_to_path(void)
 216 {
 217         static char full_path[MAXNAMELEN];
 218         struct cprconfig *cf = &cprconfig;
 219         char *ptr;
 220 
 221         /*
 222          * build /fs/path without extra '/'
 223          */
 224         (void) strcpy(full_path, cf->cf_fs);
 225         if (strcmp(cf->cf_fs, "/"))
 226                 (void) strcat(full_path, "/");
 227         ptr = cf->cf_path;
 228         if (*ptr == '/')
 229                 ptr++;
 230         (void) strcat(full_path, ptr);
 231         return (full_path);
 232 }
 233 
 234 
 235 /*
 236  * Verify that the information in the configuration file regarding the
 237  * location for the statefile is still valid, depending on cf_type.
 238  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
 239  *      mounted on the same device as when pmconfig was last run,
 240  *      and the translation of that device to a node in the prom's
 241  *      device tree must be the same as when pmconfig was last run.
 242  * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
 243  *      special file, it must have no file system mounted on it,
 244  *      and the translation of that device to a node in the prom's
 245  *      device tree must be the same as when pmconfig was last run.
 246  */
 247 static int
 248 cpr_verify_statefile_path(void)
 249 {
 250         struct cprconfig *cf = &cprconfig;
 251         static const char long_name[] = "Statefile pathname is too long.\n";
 252         static const char lookup_fmt[] = "Lookup failed for "
 253             "cpr statefile device %s.\n";
 254         static const char path_chg_fmt[] = "Device path for statefile "
 255             "has changed from %s to %s.\t%s\n";
 256         static const char rerun[] = "Please rerun pmconfig(1m).";
 257         struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
 258         ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
 259         ufsvfs_t *ufsvfsp_save = ufsvfsp;
 260         int error;
 261         struct vnode *vp;
 262         char *slash, *tail, *longest;
 263         char *errstr;
 264         int found = 0;
 265         union {
 266                 char un_devpath[OBP_MAXPATHLEN];
 267                 char un_sfpath[MAXNAMELEN];
 268         } un;
 269 #define devpath un.un_devpath
 270 #define sfpath  un.un_sfpath
 271 
 272         ASSERT(cprconfig_loaded);
 273         /*
 274          * We need not worry about locking or the timing of releasing
 275          * the vnode, since we are single-threaded now.
 276          */
 277 
 278         switch (cf->cf_type) {
 279         case CFT_SPEC:
 280                 error = i_devname_to_promname(cf->cf_devfs, devpath,
 281                     OBP_MAXPATHLEN);
 282                 if (error || strcmp(devpath, cf->cf_dev_prom)) {
 283                         cpr_err(CE_CONT, path_chg_fmt,
 284                             cf->cf_dev_prom, devpath, rerun);
 285                         return (error);
 286                 }
 287                 /*FALLTHROUGH*/
 288         case CFT_ZVOL:
 289                 if (strlen(cf->cf_path) > sizeof (sfpath)) {
 290                         cpr_err(CE_CONT, long_name);
 291                         return (ENAMETOOLONG);
 292                 }
 293                 if ((error = lookupname(cf->cf_devfs,
 294                     UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
 295                         cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
 296                         return (error);
 297                 }
 298                 if (vp->v_type != VBLK)
 299                         errstr = "statefile must be a block device";
 300                 else if (vfs_devismounted(vp->v_rdev))
 301                         errstr = "statefile device must not "
 302                             "have a file system mounted on it";
 303                 else if (IS_SWAPVP(vp))
 304                         errstr = "statefile device must not "
 305                             "be configured as swap file";
 306                 else
 307                         errstr = NULL;
 308 
 309                 VN_RELE(vp);
 310                 if (errstr) {
 311                         cpr_err(CE_CONT, "%s.\n", errstr);
 312                         return (ENOTSUP);
 313                 }
 314 
 315                 return (error);
 316         case CFT_UFS:
 317                 break;          /* don't indent all the original code */
 318         default:
 319                 cpr_err(CE_PANIC, "invalid cf_type");
 320         }
 321 
 322         /*
 323          * The original code for UFS statefile
 324          */
 325         if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
 326                 cpr_err(CE_CONT, long_name);
 327                 return (ENAMETOOLONG);
 328         }
 329 
 330         bzero(sfpath, sizeof (sfpath));
 331         (void) strcpy(sfpath, cpr_cprconfig_to_path());
 332 
 333         if (*sfpath != '/') {
 334                 cpr_err(CE_CONT, "Statefile pathname %s "
 335                     "must begin with a /\n", sfpath);
 336                 return (EINVAL);
 337         }
 338 
 339         /*
 340          * Find the longest prefix of the statefile pathname which
 341          * is the mountpoint of a filesystem.  This string must
 342          * match the cf_fs field we read from the config file.  Other-
 343          * wise the user has changed things without running pmconfig.
 344          */
 345         tail = longest = sfpath + 1;    /* pt beyond the leading "/" */
 346         while ((slash = strchr(tail, '/')) != NULL) {
 347                 *slash = '\0';    /* temporarily terminate the string */
 348                 if ((error = lookupname(sfpath,
 349                     UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
 350                         *slash = '/';
 351                         cpr_err(CE_CONT, "A directory in the "
 352                             "statefile path %s was not found.\n", sfpath);
 353                         VN_RELE(vp);
 354 
 355                         return (error);
 356                 }
 357 
 358                 vfs_list_read_lock();
 359                 vfsp = rootvfs;
 360                 do {
 361                         ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
 362                         if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
 363                                 found = 1;
 364                                 break;
 365                         }
 366                         vfsp = vfsp->vfs_next;
 367                 } while (vfsp != rootvfs);
 368                 vfs_list_unlock();
 369 
 370                 /*
 371                  * If we have found a filesystem mounted on the current
 372                  * path prefix, remember the end of the string in
 373                  * "longest".  If it happens to be the the exact fs
 374                  * saved in the configuration file, save the current
 375                  * ufsvfsp so we can make additional checks further down.
 376                  */
 377                 if (found) {
 378                         longest = slash;
 379                         if (strcmp(cf->cf_fs, sfpath) == 0) {
 380                                 ufsvfsp_save = ufsvfsp;
 381                                 vfsp_save = vfsp;
 382                         }
 383                         found = 0;
 384                 }
 385 
 386                 VN_RELE(vp);
 387                 *slash = '/';
 388                 tail = slash + 1;
 389         }
 390         *longest = '\0';
 391         if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
 392                 cpr_err(CE_CONT, "Filesystem containing "
 393                     "the statefile when pmconfig was run (%s) has "
 394                     "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
 395                 return (EINVAL);
 396         }
 397 
 398         if ((error = lookupname(cf->cf_devfs,
 399             UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
 400                 cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
 401                 return (error);
 402         }
 403 
 404         if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
 405                 cpr_err(CE_CONT, "Filesystem containing "
 406                     "statefile no longer mounted on device %s. "
 407                     "See power.conf(4).", cf->cf_devfs);
 408                 VN_RELE(vp);
 409                 return (ENXIO);
 410         }
 411         VN_RELE(vp);
 412 
 413         error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
 414         if (error || strcmp(devpath, cf->cf_dev_prom)) {
 415                 cpr_err(CE_CONT, path_chg_fmt,
 416                     cf->cf_dev_prom, devpath, rerun);
 417                 return (error);
 418         }
 419 
 420         return (0);
 421 }
 422 
 423 /*
 424  * Make sure that the statefile can be used as a block special statefile
 425  * (meaning that is exists and has nothing mounted on it)
 426  * Returns errno if not a valid statefile.
 427  */
 428 int
 429 cpr_check_spec_statefile(void)
 430 {
 431         int err;
 432 
 433         if (err = cpr_get_config())
 434                 return (err);
 435         ASSERT(cprconfig.cf_type == CFT_SPEC ||
 436             cprconfig.cf_type == CFT_ZVOL);
 437 
 438         if (cprconfig.cf_devfs == NULL)
 439                 return (ENXIO);
 440 
 441         return (cpr_verify_statefile_path());
 442 
 443 }
 444 
 445 int
 446 cpr_alloc_statefile(int alloc_retry)
 447 {
 448         register int rc = 0;
 449         char *str;
 450 
 451         /*
 452          * Statefile size validation. If checkpoint the first time, disk blocks
 453          * allocation will be done; otherwise, just do file size check.
 454          * if statefile allocation is being retried, C_VP will be inited
 455          */
 456         if (alloc_retry) {
 457                 str = "\n-->Retrying statefile allocation...";
 458                 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
 459                         prom_printf(str);
 460                 if (C_VP->v_type != VBLK)
 461                         (void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
 462         } else {
 463                 /*
 464                  * Open an exiting file for writing, the state file needs to be
 465                  * pre-allocated since we can't and don't want to do allocation
 466                  * during checkpoint (too much of the OS is disabled).
 467                  *    - do a preliminary size checking here, if it is too small,
 468                  *      allocate more space internally and retry.
 469                  *    - check the vp to make sure it's the right type.
 470                  */
 471                 char *path = cpr_build_statefile_path();
 472 
 473                 if (path == NULL)
 474                         return (ENXIO);
 475                 else if (rc = cpr_verify_statefile_path())
 476                         return (rc);
 477 
 478                 if (rc = vn_open(path, UIO_SYSSPACE,
 479                     FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
 480                         cpr_err(CE_WARN, "cannot open statefile %s", path);
 481                         return (rc);
 482                 }
 483         }
 484 
 485         /*
 486          * Only ufs and block special statefiles supported
 487          */
 488         if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
 489                 cpr_err(CE_CONT,
 490                     "Statefile must be regular file or block special file.");
 491                 return (EACCES);
 492         }
 493 
 494         if (rc = cpr_statefile_ok(C_VP, alloc_retry))
 495                 return (rc);
 496 
 497         if (C_VP->v_type != VBLK) {
 498                 /*
 499                  * sync out the fs change due to the statefile reservation.
 500                  */
 501                 (void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
 502 
 503                 /*
 504                  * Validate disk blocks allocation for the state file.
 505                  * Ask the file system prepare itself for the dump operation.
 506                  */
 507                 if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
 508                         cpr_err(CE_CONT, "Error allocating "
 509                             "blocks for cpr statefile.");
 510                         return (rc);
 511                 }
 512         }
 513         return (0);
 514 }
 515 
 516 
 517 /*
 518  * Lookup device size and return available space in bytes.
 519  * NOTE: Since prop_op(9E) can't tell the difference between a character
 520  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
 521  */
 522 size_t
 523 cpr_get_devsize(dev_t dev)
 524 {
 525         size_t bytes = 0;
 526 
 527         bytes = cdev_Size(dev);
 528         if (bytes == 0)
 529                 bytes = cdev_size(dev);
 530 
 531         if (bytes > CPR_SPEC_OFFSET)
 532                 bytes -= CPR_SPEC_OFFSET;
 533         else
 534                 bytes = 0;
 535 
 536         return (bytes);
 537 }
 538 
 539 
 540 /*
 541  * increase statefile size
 542  */
 543 static int
 544 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
 545 {
 546         extern uchar_t cpr_pagecopy[];
 547         struct inode *ip = VTOI(vp);
 548         u_longlong_t offset;
 549         int error, increase;
 550         ssize_t resid;
 551 
 552         rw_enter(&ip->i_contents, RW_READER);
 553         increase = (ip->i_size < newsize);
 554         offset = ip->i_size;
 555         rw_exit(&ip->i_contents);
 556 
 557         if (increase == 0)
 558                 return (0);
 559 
 560         /*
 561          * write to each logical block to reserve disk space
 562          */
 563         error = 0;
 564         cpr_pagecopy[0] = '1';
 565         for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
 566                 if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
 567                     ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
 568                     (rlim64_t)MAXOFF_T, CRED(), &resid)) {
 569                         if (error == ENOSPC) {
 570                                 cpr_err(CE_WARN, "error %d while reserving "
 571                                     "disk space for statefile %s\n"
 572                                     "wanted %lld bytes, file is %lld short",
 573                                     error, cpr_cprconfig_to_path(),
 574                                     newsize, newsize - offset);
 575                         }
 576                         break;
 577                 }
 578         }
 579         return (error);
 580 }
 581 
 582 
/*
 * do a simple estimate of the space needed to hold the statefile
 * taking compression into account, but be fairly conservative
 * so we have a better chance of completing; when dump fails,
 * the retry cost is fairly high.
 *
 * Do disk blocks allocation for the state file if no space has
 * been allocated yet. Since the state file will not be removed,
 * allocation should only be done once.
 *
 *      vp              statefile vnode; either a block device (VBLK) or
 *                      a file whose inode is reached through VTOI()
 *      alloc_retry     non-zero when a previous attempt came up short
 *
 * Returns 0 when the statefile is (now) large enough.  Returns ENOMEM
 * when a retry is requested for a block device, when the block device is
 * smaller than the estimate, or when the retry count exceeds
 * C_MAX_ALLOC_RETRY; for a regular file the result of
 * cpr_grow_statefile() is returned.  The estimate is also stored in
 * STAT->cs_est_statefsz.
 */
static int
cpr_statefile_ok(vnode_t *vp, int alloc_retry)
{
        extern size_t cpr_bitmap_size;
        struct inode *ip = VTOI(vp);
        const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
        u_longlong_t size, isize, ksize, raw_data;
        char *str, *est_fmt;
        size_t space;
        int error;

        /*
         * number of pages short for swapping.
         */
        STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
        if (STAT->cs_nosw_pages < 0)
                STAT->cs_nosw_pages = 0;

        /* prefix for the debug messages below */
        str = "cpr_statefile_ok:";

        CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
            k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
        CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
            MAX(availrmem - swapfs_minfree, 0),
            k_anoninfo.ani_mem_resv);
        CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
            CURRENT_TOTAL_AVAILABLE_SWAP);

        /*
         * try increasing filesize by 15%
         */
        if (alloc_retry) {
                /*
                 * block device doesn't get any bigger
                 */
                if (vp->v_type == VBLK) {
                        if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                                prom_printf(
                                    "Retry statefile on special file\n");
                        return (ENOMEM);
                } else {
                        /* new target is SIZE_RATE percent of the current size */
                        rw_enter(&ip->i_contents, RW_READER);
                        size = (ip->i_size * SIZE_RATE) / INTEGRAL;
                        rw_exit(&ip->i_contents);
                }
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("Retry statefile size = %lld\n", size);
        } else {
                u_longlong_t cpd_size;
                pgcnt_t npages, nback;
                int ndvram;

                /*
                 * Run the CB_CL_CPR_FB callbacks with the address of
                 * ndvram as their argument.
                 * NOTE(review): the address is passed through an int;
                 * confirm how the callbacks recover the pointer where
                 * pointers are wider than int.
                 */
                ndvram = 0;
                (void) callb_execute_class(CB_CL_CPR_FB,
                    (int)(uintptr_t)&ndvram);
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("ndvram size = %d\n", ndvram);

                /*
                 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
                 */
                npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
                cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
                /* raw_data: descriptors and bitmaps, counted at full size */
                raw_data = cpd_size + cpr_bitmap_size;
                /* ksize: ndvram plus the bytes of the counted kernel pages */
                ksize = ndvram + mmu_ptob(npages);

                est_fmt = "%s estimated size with "
                    "%scompression %lld, ksize %lld\n";
                /* nback: bytes for the pages that are short for swapping */
                nback = mmu_ptob(STAT->cs_nosw_pages);
                if (CPR->c_flags & C_COMPRESSING) {
                        /*
                         * ksize is scaled to COMPRESS_PERCENT percent and
                         * nback by 10 / UCOMP_RATE (one half).
                         */
                        size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
                            raw_data + ((nback * 10) / UCOMP_RATE);
                        CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
                } else {
                        size = ksize + raw_data + nback;
                        CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
                            size, ksize);
                }
        }

        /*
         * All this is much simpler for a block device
         */
        if (vp->v_type == VBLK) {
                space = cpr_get_devsize(vp->v_rdev);
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("statefile dev size %lu\n", space);

                /*
                 * Export the estimated filesize info, this value will be
                 * compared before dumping out the statefile in the case of
                 * no compression.
                 */
                STAT->cs_est_statefsz = size;
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("%s Estimated statefile size %llu, "
                            "space %lu\n", str, size, space);
                if (size > space) {
                        cpr_err(CE_CONT, "Statefile partition too small.");
                        return (ENOMEM);
                }
                return (0);
        } else {
                /* every pass for a regular file counts against the limit */
                if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
                        cpr_err(CE_CONT, "Statefile allocation retry failed\n");
                        return (ENOMEM);
                }

                /*
                 * Estimate space needed for the state file.
                 *
                 * State file size in bytes:
                 *      kernel size + non-cache pte seg +
                 *      bitmap size + cpr state file headers size
                 * (round up to fs->fs_bsize)
                 */
                size = blkroundup(ip->i_fs, size);

                /*
                 * Export the estimated filesize info, this value will be
                 * compared before dumping out the statefile in the case of
                 * no compression.
                 */
                STAT->cs_est_statefsz = size;
                error = cpr_grow_statefile(vp, size);
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
                        rw_enter(&ip->i_contents, RW_READER);
                        isize = ip->i_size;
                        rw_exit(&ip->i_contents);
                        prom_printf("%s Estimated statefile size %lld, "
                            "i_size %lld\n", str, size, isize);
                }

                return (error);
        }
}
 729 
 730 
 731 void
 732 cpr_statef_close(void)
 733 {
 734         if (C_VP) {
 735                 if (!cpr_reusable_mode)
 736                         (void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
 737                 (void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
 738                 VN_RELE(C_VP);
 739                 C_VP = 0;
 740         }
 741 }
 742 
 743 
 744 /*
 745  * open cpr default file and display error
 746  */
 747 int
 748 cpr_open_deffile(int mode, vnode_t **vpp)
 749 {
 750         int error;
 751 
 752         if (error = cpr_open(cpr_default_path, mode, vpp))
 753                 cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
 754                     cpr_default_path, error);
 755         return (error);
 756 }
 757 
 758 
 759 /*
 760  * write cdef_t to disk.  This contains the original values of prom
 761  * properties that we modify.  We fill in the magic number of the file
 762  * here as a signal to the booter code that the state file is valid.
 763  * Be sure the file gets synced, since we may be shutting down the OS.
 764  */
 765 int
 766 cpr_write_deffile(cdef_t *cdef)
 767 {
 768         struct vnode *vp;
 769         char *str;
 770         int rc;
 771 
 772         if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
 773                 return (rc);
 774 
 775         if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
 776                 str = "write";
 777         else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
 778                 str = "fsync";
 779         (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
 780         VN_RELE(vp);
 781 
 782         if (rc) {
 783                 cpr_err(CE_WARN, "%s error %d, file \"%s\"",
 784                     str, rc, cpr_default_path);
 785         }
 786         return (rc);
 787 }
 788 
 789 /*
 790  * Clear the magic number in the defaults file.  This tells the booter
 791  * program that the state file is not current and thus prevents
 792  * any attempt to restore from an obsolete state file.
 793  */
 794 void
 795 cpr_clear_definfo(void)
 796 {
 797         struct vnode *vp;
 798         cmini_t mini;
 799 
 800         if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
 801             cpr_open_deffile(FCREAT|FWRITE, &vp))
 802                 return;
 803         mini.magic = mini.reusable = 0;
 804         (void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
 805         (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
 806         VN_RELE(vp);
 807 }
 808 
 809 /*
 810  * If the cpr default file is invalid, then we must not be in reusable mode
 811  * if it is valid, it tells us our mode
 812  */
 813 int
 814 cpr_get_reusable_mode(void)
 815 {
 816         struct vnode *vp;
 817         cmini_t mini;
 818         int rc;
 819 
 820         if (cpr_open(cpr_default_path, FREAD, &vp))
 821                 return (0);
 822 
 823         rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
 824         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 825         VN_RELE(vp);
 826         if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
 827                 return (mini.reusable);
 828 
 829         return (0);
 830 }
 831 #endif
 832 
/*
 * clock/time related routines
 */
/* seconds value recorded by cpr_save_time(), consumed by cpr_restore_time() */
static time_t   cpr_time_stamp;
 837 
 838 
 839 void
 840 cpr_tod_get(cpr_time_t *ctp)
 841 {
 842         timestruc_t ts;
 843 
 844         mutex_enter(&tod_lock);
 845         ts = TODOP_GET(tod_ops);
 846         mutex_exit(&tod_lock);
 847         ctp->tv_sec = (time32_t)ts.tv_sec;
 848         ctp->tv_nsec = (int32_t)ts.tv_nsec;
 849 }
 850 
 851 void
 852 cpr_tod_status_set(int tod_flag)
 853 {
 854         mutex_enter(&tod_lock);
 855         tod_status_set(tod_flag);
 856         mutex_exit(&tod_lock);
 857 }
 858 
 859 void
 860 cpr_save_time(void)
 861 {
 862         cpr_time_stamp = gethrestime_sec();
 863 }
 864 
 865 /*
 866  * correct time based on saved time stamp or hardware clock
 867  */
 868 void
 869 cpr_restore_time(void)
 870 {
 871         clkset(cpr_time_stamp);
 872 }
 873 
 874 #if defined(__sparc)
 875 /*
 876  * CPU ONLINE/OFFLINE CODE
 877  */
 878 int
 879 cpr_mp_offline(void)
 880 {
 881         cpu_t *cp, *bootcpu;
 882         int rc = 0;
 883         int brought_up_boot = 0;
 884 
 885         /*
 886          * Do nothing for UP.
 887          */
 888         if (ncpus == 1)
 889                 return (0);
 890 
 891         mutex_enter(&cpu_lock);
 892 
 893         cpr_save_mp_state();
 894 
 895         bootcpu = i_cpr_bootcpu();
 896         if (!CPU_ACTIVE(bootcpu)) {
 897                 if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
 898                         mutex_exit(&cpu_lock);
 899                         return (rc);
 900                 }
 901                 brought_up_boot = 1;
 902         }
 903 
 904         cp = cpu_list;
 905         do {
 906                 if (cp == bootcpu)
 907                         continue;
 908                 if (cp->cpu_flags & CPU_OFFLINE)
 909                         continue;
 910                 if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
 911                         mutex_exit(&cpu_lock);
 912                         return (rc);
 913                 }
 914         } while ((cp = cp->cpu_next) != cpu_list);
 915         if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
 916                 prom_printf("changed cpu %p to state %d\n",
 917                     (void *)bootcpu, CPU_CPR_ONLINE);
 918         mutex_exit(&cpu_lock);
 919 
 920         return (rc);
 921 }
 922 
 923 int
 924 cpr_mp_online(void)
 925 {
 926         cpu_t *cp, *bootcpu = CPU;
 927         int rc = 0;
 928 
 929         /*
 930          * Do nothing for UP.
 931          */
 932         if (ncpus == 1)
 933                 return (0);
 934 
 935         /*
 936          * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
 937          * to indicate a cpu was online at the time of cpr_suspend();
 938          * now restart those cpus that were marked as CPU_CPR_ONLINE
 939          * and actually are offline.
 940          */
 941         mutex_enter(&cpu_lock);
 942         for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
 943                 /*
 944                  * Clear the CPU_FROZEN flag in all cases.
 945                  */
 946                 cp->cpu_flags &= ~CPU_FROZEN;
 947 
 948                 if (CPU_CPR_IS_OFFLINE(cp))
 949                         continue;
 950                 if (CPU_ACTIVE(cp))
 951                         continue;
 952                 if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
 953                         mutex_exit(&cpu_lock);
 954                         return (rc);
 955                 }
 956         }
 957 
 958         /*
 959          * turn off the boot cpu if it was offlined
 960          */
 961         if (CPU_CPR_IS_OFFLINE(bootcpu)) {
 962                 if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
 963                         mutex_exit(&cpu_lock);
 964                         return (rc);
 965                 }
 966         }
 967         mutex_exit(&cpu_lock);
 968         return (0);
 969 }
 970 
 971 static void
 972 cpr_save_mp_state(void)
 973 {
 974         cpu_t *cp;
 975 
 976         ASSERT(MUTEX_HELD(&cpu_lock));
 977 
 978         cp = cpu_list;
 979         do {
 980                 cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
 981                 if (CPU_ACTIVE(cp))
 982                         CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
 983         } while ((cp = cp->cpu_next) != cpu_list);
 984 }
 985 
 986 /*
 987  * change cpu to online/offline
 988  */
 989 static int
 990 cpr_p_online(cpu_t *cp, int state)
 991 {
 992         int rc;
 993 
 994         ASSERT(MUTEX_HELD(&cpu_lock));
 995 
 996         switch (state) {
 997         case CPU_CPR_ONLINE:
 998                 rc = cpu_online(cp);
 999                 break;
1000         case CPU_CPR_OFFLINE:
1001                 rc = cpu_offline(cp, CPU_FORCED);
1002                 break;
1003         }
1004         if (rc) {
1005                 cpr_err(CE_WARN, "Failed to change processor %d to "
1006                     "state %d, (errno %d)", cp->cpu_id, state, rc);
1007         }
1008         return (rc);
1009 }
1010 
1011 /*
1012  * Construct the pathname of the state file and return a pointer to
1013  * caller.  Read the config file to get the mount point of the
1014  * filesystem and the pathname within fs.
1015  */
1016 char *
1017 cpr_build_statefile_path(void)
1018 {
1019         struct cprconfig *cf = &cprconfig;
1020 
1021         if (cpr_get_config())
1022                 return (NULL);
1023 
1024         switch (cf->cf_type) {
1025         case CFT_UFS:
1026                 if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1027                         cpr_err(CE_CONT, "Statefile path is too long.\n");
1028                         return (NULL);
1029                 }
1030                 return (cpr_cprconfig_to_path());
1031         case CFT_ZVOL:
1032                 /*FALLTHROUGH*/
1033         case CFT_SPEC:
1034                 return (cf->cf_devfs);
1035         default:
1036                 cpr_err(CE_PANIC, "invalid statefile type");
1037                 /*NOTREACHED*/
1038                 return (NULL);
1039         }
1040 }
1041 
1042 int
1043 cpr_statefile_is_spec(void)
1044 {
1045         if (cpr_get_config())
1046                 return (0);
1047         return (cprconfig.cf_type == CFT_SPEC);
1048 }
1049 
1050 char *
1051 cpr_get_statefile_prom_path(void)
1052 {
1053         struct cprconfig *cf = &cprconfig;
1054 
1055         ASSERT(cprconfig_loaded);
1056         ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
1057         ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
1058         return (cf->cf_dev_prom);
1059 }
1060 
1061 
1062 /*
1063  * XXX The following routines need to be in the vfs source code.
1064  */
1065 
1066 int
1067 cpr_is_ufs(struct vfs *vfsp)
1068 {
1069         char *fsname;
1070 
1071         fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1072         return (strcmp(fsname, "ufs") == 0);
1073 }
1074 
1075 int
1076 cpr_is_zfs(struct vfs *vfsp)
1077 {
1078         char *fsname;
1079 
1080         fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1081         return (strcmp(fsname, "zfs") == 0);
1082 }
1083 
1084 /*
1085  * This is a list of file systems that are allowed to be writeable when a
1086  * reusable statefile checkpoint is taken.  They must not have any state that
1087  * cannot be restored to consistency by simply rebooting using the checkpoint.
1088  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
1089  * out of sync with the in-kernel data).
1090  */
1091 int
1092 cpr_reusable_mount_check(void)
1093 {
1094         struct vfs *vfsp;
1095         char *fsname;
1096         char **cpp;
1097         static char *cpr_writeok_fss[] = {
1098                 "autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1099                 "proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1100         };
1101 
1102         vfs_list_read_lock();
1103         vfsp = rootvfs;
1104         do {
1105                 if (vfsp->vfs_flag & VFS_RDONLY) {
1106                         vfsp = vfsp->vfs_next;
1107                         continue;
1108                 }
1109                 fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1110                 for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1111                         if (strcmp(fsname, *cpp) == 0)
1112                                 break;
1113                 }
1114                 /*
1115                  * if the inner loop reached the NULL terminator,
1116                  * the current fs-type does not match any OK-type
1117                  */
1118                 if (*cpp == NULL) {
1119                         cpr_err(CE_CONT, "a filesystem of type %s is "
1120                             "mounted read/write.\nReusable statefile requires "
1121                             "no writeable filesystem of this type be mounted\n",
1122                             fsname);
1123                         vfs_list_unlock();
1124                         return (EINVAL);
1125                 }
1126                 vfsp = vfsp->vfs_next;
1127         } while (vfsp != rootvfs);
1128         vfs_list_unlock();
1129         return (0);
1130 }
1131 
1132 /*
1133  * return statefile offset in DEV_BSIZE units
1134  */
1135 int
1136 cpr_statefile_offset(void)
1137 {
1138         return (cprconfig.cf_type != CFT_UFS ? btod(CPR_SPEC_OFFSET) : 0);
1139 }
1140 
1141 /*
1142  * Force a fresh read of the cprinfo per uadmin 3 call
1143  */
1144 void
1145 cpr_forget_cprconfig(void)
1146 {
1147         cprconfig_loaded = 0;
1148 }
1149 #endif