1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/errno.h>
  29 #include <sys/cpuvar.h>
  30 #include <sys/vfs.h>
  31 #include <sys/vnode.h>
  32 #include <sys/pathname.h>
  33 #include <sys/callb.h>
  34 #include <sys/fs/ufs_inode.h>
  35 #include <vm/anon.h>
  36 #include <sys/fs/swapnode.h>      /* for swapfs_minfree */
  37 #include <sys/kmem.h>
  38 #include <sys/cpr.h>
  39 #include <sys/conf.h>
  40 #include <sys/machclock.h>
  41 
/*
 * CPR miscellaneous support routines
 */

/*
 * cpr_open: vn_open() wrapper for kernel-space (UIO_SYSSPACE) pathnames.
 * Always passes create mode 0600 and CRCREAT; evaluates to vn_open()'s
 * return value.
 */
#define cpr_open(path, mode,  vpp)      (vn_open(path, UIO_SYSSPACE, \
                mode, 0600, vpp, CRCREAT, 0))
/*
 * cpr_rdwr: vn_rdwr() wrapper that transfers cnt bytes between the kernel
 * buffer basep and file offset 0 of vp.  The residual-count pointer passed
 * to vn_rdwr() is NULL; evaluates to vn_rdwr()'s return value.
 */
#define cpr_rdwr(rw, vp, basep, cnt)    (vn_rdwr(rw, vp,  (caddr_t)(basep), \
                cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
                (ssize_t *)NULL))

/* routines defined outside this file */
extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

/* taken by cpr_init() and dropped by cpr_done() */
extern kmutex_t cpr_slock;
/* buffers (and their sizes) that cpr_done() frees when non-NULL */
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
/* in-memory copy of the config file, filled in by cpr_get_config() */
static struct cprconfig cprconfig;
/* set to 1 once cprconfig holds a config whose magic number checked out */
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);
int cpr_is_zfs(struct vfs *);

/* pathname handed to cpr_open() by the default-file routines */
char cpr_default_path[] = CPR_DEFAULT;

/* integer-math factors used by cpr_statefile_ok() when sizing the statefile */
#define COMPRESS_PERCENT 40     /* approx compression ratio in percent */
#define SIZE_RATE       115     /* increase size by 15% */
#define INTEGRAL        100     /* for integer math */
  80 
  81 
/*
 * cmn_err() followed by a 1/4 second delay; this gives the
 * logging service a chance to flush messages and helps avoid
 * intermixing output from prom_printf().
 *
 *      ce      severity level, handed unchanged to vcmn_err()
 *      fmt     printf-style format; the variable arguments are its operands
 */
/*PRINTFLIKE2*/
void
cpr_err(int ce, const char *fmt, ...)
{
        va_list adx;

        va_start(adx, fmt);
        vcmn_err(ce, fmt, adx);
        va_end(adx);
        /* wait for one quarter of MICROSEC microseconds */
        drv_usecwait(MICROSEC >> 2);
}
  98 
  99 
 100 int
 101 cpr_init(int fcn)
 102 {
 103         /*
 104          * Allow only one suspend/resume process.
 105          */
 106         if (mutex_tryenter(&cpr_slock) == 0)
 107                 return (EBUSY);
 108 
 109         CPR->c_flags = 0;
 110         CPR->c_substate = 0;
 111         CPR->c_cprboot_magic = 0;
 112         CPR->c_alloc_cnt = 0;
 113 
 114         CPR->c_fcn = fcn;
 115         if (fcn == AD_CPR_REUSABLE)
 116                 CPR->c_flags |= C_REUSABLE;
 117         else
 118                 CPR->c_flags |= C_SUSPENDING;
 119         if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
 120                 return (0);
 121         }
 122 #if defined(__sparc)
 123         if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
 124                 CPR->c_flags |= C_COMPRESSING;
 125         /*
 126          * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
 127          */
 128         CPR->c_mapping_area = i_cpr_map_setup();
 129         if (CPR->c_mapping_area == 0) {              /* no space in kernelmap */
 130                 cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
 131                 mutex_exit(&cpr_slock);
 132                 return (EAGAIN);
 133         }
 134         if (cpr_debug & CPR_DEBUG3)
 135                 cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
 136                     "kas\n", (void *)CPR->c_mapping_area);
 137 #endif
 138 
 139         return (0);
 140 }
 141 
 142 /*
 143  * This routine releases any resources used during the checkpoint.
 144  */
 145 void
 146 cpr_done(void)
 147 {
 148         cpr_stat_cleanup();
 149         i_cpr_bitmap_cleanup();
 150 
 151         /*
 152          * Free pages used by cpr buffers.
 153          */
 154         if (cpr_buf) {
 155                 kmem_free(cpr_buf, cpr_buf_size);
 156                 cpr_buf = NULL;
 157         }
 158         if (cpr_pagedata) {
 159                 kmem_free(cpr_pagedata, cpr_pagedata_size);
 160                 cpr_pagedata = NULL;
 161         }
 162 
 163         i_cpr_free_memory_resources();
 164         mutex_exit(&cpr_slock);
 165         cpr_err(CE_CONT, "System has been resumed.\n");
 166 }
 167 
 168 
 169 #if defined(__sparc)
 170 /*
 171  * reads config data into cprconfig
 172  */
 173 static int
 174 cpr_get_config(void)
 175 {
 176         static char config_path[] = CPR_CONFIG;
 177         struct cprconfig *cf = &cprconfig;
 178         struct vnode *vp;
 179         char *fmt;
 180         int err;
 181 
 182         if (cprconfig_loaded)
 183                 return (0);
 184 
 185         fmt = "cannot %s config file \"%s\", error %d\n";
 186         if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
 187                 cpr_err(CE_CONT, fmt, "open", config_path, err);
 188                 return (err);
 189         }
 190 
 191         err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
 192         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 193         VN_RELE(vp);
 194         if (err) {
 195                 cpr_err(CE_CONT, fmt, "read", config_path, err);
 196                 return (err);
 197         }
 198 
 199         if (cf->cf_magic == CPR_CONFIG_MAGIC)
 200                 cprconfig_loaded = 1;
 201         else {
 202                 cpr_err(CE_CONT, "invalid config file \"%s\", "
 203                     "rerun pmconfig(1M)\n", config_path);
 204                 err = EINVAL;
 205         }
 206 
 207         return (err);
 208 }
 209 
 210 
 211 /*
 212  * concat fs and path fields of the cprconfig structure;
 213  * returns pointer to the base of static data
 214  */
 215 static char *
 216 cpr_cprconfig_to_path(void)
 217 {
 218         static char full_path[MAXNAMELEN];
 219         struct cprconfig *cf = &cprconfig;
 220         char *ptr;
 221 
 222         /*
 223          * build /fs/path without extra '/'
 224          */
 225         (void) strcpy(full_path, cf->cf_fs);
 226         if (strcmp(cf->cf_fs, "/"))
 227                 (void) strcat(full_path, "/");
 228         ptr = cf->cf_path;
 229         if (*ptr == '/')
 230                 ptr++;
 231         (void) strcat(full_path, ptr);
 232         return (full_path);
 233 }
 234 
 235 
 236 /*
 237  * Verify that the information in the configuration file regarding the
 238  * location for the statefile is still valid, depending on cf_type.
 239  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
 240  *      mounted on the same device as when pmconfig was last run,
 241  *      and the translation of that device to a node in the prom's
 242  *      device tree must be the same as when pmconfig was last run.
 243  * for CFT_SPEC and CFT_ZVOL, cf_path must be the path to a block
 244  *      special file, it must have no file system mounted on it,
 245  *      and the translation of that device to a node in the prom's
 246  *      device tree must be the same as when pmconfig was last run.
 247  */
 248 static int
 249 cpr_verify_statefile_path(void)
 250 {
 251         struct cprconfig *cf = &cprconfig;
 252         static const char long_name[] = "Statefile pathname is too long.\n";
 253         static const char lookup_fmt[] = "Lookup failed for "
 254             "cpr statefile device %s.\n";
 255         static const char path_chg_fmt[] = "Device path for statefile "
 256             "has changed from %s to %s.\t%s\n";
 257         static const char rerun[] = "Please rerun pmconfig(1m).";
 258         struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
 259         ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
 260         ufsvfs_t *ufsvfsp_save = ufsvfsp;
 261         int error;
 262         struct vnode *vp;
 263         char *slash, *tail, *longest;
 264         char *errstr;
 265         int found = 0;
 266         union {
 267                 char un_devpath[OBP_MAXPATHLEN];
 268                 char un_sfpath[MAXNAMELEN];
 269         } un;
 270 #define devpath un.un_devpath
 271 #define sfpath  un.un_sfpath
 272 
 273         ASSERT(cprconfig_loaded);
 274         /*
 275          * We need not worry about locking or the timing of releasing
 276          * the vnode, since we are single-threaded now.
 277          */
 278 
 279         switch (cf->cf_type) {
 280         case CFT_SPEC:
 281                 error = i_devname_to_promname(cf->cf_devfs, devpath,
 282                     OBP_MAXPATHLEN);
 283                 if (error || strcmp(devpath, cf->cf_dev_prom)) {
 284                         cpr_err(CE_CONT, path_chg_fmt,
 285                             cf->cf_dev_prom, devpath, rerun);
 286                         return (error);
 287                 }
 288                 /*FALLTHROUGH*/
 289         case CFT_ZVOL:
 290                 if (strlen(cf->cf_path) > sizeof (sfpath)) {
 291                         cpr_err(CE_CONT, long_name);
 292                         return (ENAMETOOLONG);
 293                 }
 294                 if ((error = lookupname(cf->cf_devfs,
 295                     UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
 296                         cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
 297                         return (error);
 298                 }
 299                 if (vp->v_type != VBLK)
 300                         errstr = "statefile must be a block device";
 301                 else if (vfs_devismounted(vp->v_rdev))
 302                         errstr = "statefile device must not "
 303                             "have a file system mounted on it";
 304                 else if (IS_SWAPVP(vp))
 305                         errstr = "statefile device must not "
 306                             "be configured as swap file";
 307                 else
 308                         errstr = NULL;
 309 
 310                 VN_RELE(vp);
 311                 if (errstr) {
 312                         cpr_err(CE_CONT, "%s.\n", errstr);
 313                         return (ENOTSUP);
 314                 }
 315 
 316                 return (error);
 317         case CFT_UFS:
 318                 break;          /* don't indent all the original code */
 319         default:
 320                 cpr_err(CE_PANIC, "invalid cf_type");
 321         }
 322 
 323         /*
 324          * The original code for UFS statefile
 325          */
 326         if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
 327                 cpr_err(CE_CONT, long_name);
 328                 return (ENAMETOOLONG);
 329         }
 330 
 331         bzero(sfpath, sizeof (sfpath));
 332         (void) strcpy(sfpath, cpr_cprconfig_to_path());
 333 
 334         if (*sfpath != '/') {
 335                 cpr_err(CE_CONT, "Statefile pathname %s "
 336                     "must begin with a /\n", sfpath);
 337                 return (EINVAL);
 338         }
 339 
 340         /*
 341          * Find the longest prefix of the statefile pathname which
 342          * is the mountpoint of a filesystem.  This string must
 343          * match the cf_fs field we read from the config file.  Other-
 344          * wise the user has changed things without running pmconfig.
 345          */
 346         tail = longest = sfpath + 1;    /* pt beyond the leading "/" */
 347         while ((slash = strchr(tail, '/')) != NULL) {
 348                 *slash = '\0';    /* temporarily terminate the string */
 349                 if ((error = lookupname(sfpath,
 350                     UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
 351                         *slash = '/';
 352                         cpr_err(CE_CONT, "A directory in the "
 353                             "statefile path %s was not found.\n", sfpath);
 354                         VN_RELE(vp);
 355 
 356                         return (error);
 357                 }
 358 
 359                 vfs_list_read_lock();
 360                 vfsp = rootvfs;
 361                 do {
 362                         ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
 363                         if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
 364                                 found = 1;
 365                                 break;
 366                         }
 367                         vfsp = vfsp->vfs_next;
 368                 } while (vfsp != rootvfs);
 369                 vfs_list_unlock();
 370 
 371                 /*
 372                  * If we have found a filesystem mounted on the current
 373                  * path prefix, remember the end of the string in
 374                  * "longest".  If it happens to be the the exact fs
 375                  * saved in the configuration file, save the current
 376                  * ufsvfsp so we can make additional checks further down.
 377                  */
 378                 if (found) {
 379                         longest = slash;
 380                         if (strcmp(cf->cf_fs, sfpath) == 0) {
 381                                 ufsvfsp_save = ufsvfsp;
 382                                 vfsp_save = vfsp;
 383                         }
 384                         found = 0;
 385                 }
 386 
 387                 VN_RELE(vp);
 388                 *slash = '/';
 389                 tail = slash + 1;
 390         }
 391         *longest = '\0';
 392         if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
 393                 cpr_err(CE_CONT, "Filesystem containing "
 394                     "the statefile when pmconfig was run (%s) has "
 395                     "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
 396                 return (EINVAL);
 397         }
 398 
 399         if ((error = lookupname(cf->cf_devfs,
 400             UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
 401                 cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
 402                 return (error);
 403         }
 404 
 405         if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
 406                 cpr_err(CE_CONT, "Filesystem containing "
 407                     "statefile no longer mounted on device %s. "
 408                     "See power.conf(4).", cf->cf_devfs);
 409                 VN_RELE(vp);
 410                 return (ENXIO);
 411         }
 412         VN_RELE(vp);
 413 
 414         error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
 415         if (error || strcmp(devpath, cf->cf_dev_prom)) {
 416                 cpr_err(CE_CONT, path_chg_fmt,
 417                     cf->cf_dev_prom, devpath, rerun);
 418                 return (error);
 419         }
 420 
 421         return (0);
 422 }
 423 
 424 /*
 425  * Make sure that the statefile can be used as a block special statefile
 426  * (meaning that is exists and has nothing mounted on it)
 427  * Returns errno if not a valid statefile.
 428  */
 429 int
 430 cpr_check_spec_statefile(void)
 431 {
 432         int err;
 433 
 434         if (err = cpr_get_config())
 435                 return (err);
 436         ASSERT(cprconfig.cf_type == CFT_SPEC ||
 437             cprconfig.cf_type == CFT_ZVOL);
 438 
 439         if (cprconfig.cf_devfs == NULL)
 440                 return (ENXIO);
 441 
 442         return (cpr_verify_statefile_path());
 443 
 444 }
 445 
 446 int
 447 cpr_alloc_statefile(int alloc_retry)
 448 {
 449         register int rc = 0;
 450         char *str;
 451 
 452         /*
 453          * Statefile size validation. If checkpoint the first time, disk blocks
 454          * allocation will be done; otherwise, just do file size check.
 455          * if statefile allocation is being retried, C_VP will be inited
 456          */
 457         if (alloc_retry) {
 458                 str = "\n-->Retrying statefile allocation...";
 459                 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
 460                         prom_printf(str);
 461                 if (C_VP->v_type != VBLK)
 462                         (void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
 463         } else {
 464                 /*
 465                  * Open an exiting file for writing, the state file needs to be
 466                  * pre-allocated since we can't and don't want to do allocation
 467                  * during checkpoint (too much of the OS is disabled).
 468                  *    - do a preliminary size checking here, if it is too small,
 469                  *      allocate more space internally and retry.
 470                  *    - check the vp to make sure it's the right type.
 471                  */
 472                 char *path = cpr_build_statefile_path();
 473 
 474                 if (path == NULL)
 475                         return (ENXIO);
 476                 else if (rc = cpr_verify_statefile_path())
 477                         return (rc);
 478 
 479                 if (rc = vn_open(path, UIO_SYSSPACE,
 480                     FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
 481                         cpr_err(CE_WARN, "cannot open statefile %s", path);
 482                         return (rc);
 483                 }
 484         }
 485 
 486         /*
 487          * Only ufs and block special statefiles supported
 488          */
 489         if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
 490                 cpr_err(CE_CONT,
 491                     "Statefile must be regular file or block special file.");
 492                 return (EACCES);
 493         }
 494 
 495         if (rc = cpr_statefile_ok(C_VP, alloc_retry))
 496                 return (rc);
 497 
 498         if (C_VP->v_type != VBLK) {
 499                 /*
 500                  * sync out the fs change due to the statefile reservation.
 501                  */
 502                 (void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
 503 
 504                 /*
 505                  * Validate disk blocks allocation for the state file.
 506                  * Ask the file system prepare itself for the dump operation.
 507                  */
 508                 if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
 509                         cpr_err(CE_CONT, "Error allocating "
 510                             "blocks for cpr statefile.");
 511                         return (rc);
 512                 }
 513         }
 514         return (0);
 515 }
 516 
 517 
 518 /*
 519  * Lookup device size and return available space in bytes.
 520  * NOTE: Since prop_op(9E) can't tell the difference between a character
 521  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
 522  */
 523 size_t
 524 cpr_get_devsize(dev_t dev)
 525 {
 526         size_t bytes = 0;
 527 
 528         bytes = cdev_Size(dev);
 529         if (bytes == 0)
 530                 bytes = cdev_size(dev);
 531 
 532         if (bytes > CPR_SPEC_OFFSET)
 533                 bytes -= CPR_SPEC_OFFSET;
 534         else
 535                 bytes = 0;
 536 
 537         return (bytes);
 538 }
 539 
 540 
 541 /*
 542  * increase statefile size
 543  */
 544 static int
 545 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
 546 {
 547         extern uchar_t cpr_pagecopy[];
 548         struct inode *ip = VTOI(vp);
 549         u_longlong_t offset;
 550         int error, increase;
 551         ssize_t resid;
 552 
 553         rw_enter(&ip->i_contents, RW_READER);
 554         increase = (ip->i_size < newsize);
 555         offset = ip->i_size;
 556         rw_exit(&ip->i_contents);
 557 
 558         if (increase == 0)
 559                 return (0);
 560 
 561         /*
 562          * write to each logical block to reserve disk space
 563          */
 564         error = 0;
 565         cpr_pagecopy[0] = '1';
 566         for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
 567                 if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
 568                     ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
 569                     (rlim64_t)MAXOFF_T, CRED(), &resid)) {
 570                         if (error == ENOSPC) {
 571                                 cpr_err(CE_WARN, "error %d while reserving "
 572                                     "disk space for statefile %s\n"
 573                                     "wanted %lld bytes, file is %lld short",
 574                                     error, cpr_cprconfig_to_path(),
 575                                     newsize, newsize - offset);
 576                         }
 577                         break;
 578                 }
 579         }
 580         return (error);
 581 }
 582 
 583 
/*
 * do a simple estimate of the space needed to hold the statefile
 * taking compression into account, but be fairly conservative
 * so we have a better chance of completing; when dump fails,
 * the retry cost is fairly high.
 *
 * Do disk blocks allocation for the state file if no space has
 * been allocated yet. Since the state file will not be removed,
 * allocation should only be done once.
 *
 *      vp              statefile vnode; either a block device (VBLK) or
 *                      a file whose inode is reached through VTOI()
 *      alloc_retry     non-zero when a previous attempt turned out too
 *                      small and the allocation is being retried
 *
 * Returns 0 when the statefile is (now) large enough.  ENOMEM is returned
 * for a retry on a block device, for a block device smaller than the
 * estimate, and once c_alloc_cnt has gone past C_MAX_ALLOC_RETRY; for a
 * file the result is otherwise that of cpr_grow_statefile().
 * The estimate is also stored in STAT->cs_est_statefsz.
 */
static int
cpr_statefile_ok(vnode_t *vp, int alloc_retry)
{
        extern size_t cpr_bitmap_size;
        /* ip is only dereferenced on the non-VBLK paths below */
        struct inode *ip = VTOI(vp);
        const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
        u_longlong_t size, isize, ksize, raw_data;
        char *str, *est_fmt;
        size_t space;
        int error;

        /*
         * number of pages short for swapping.
         */
        STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
        if (STAT->cs_nosw_pages < 0)
                STAT->cs_nosw_pages = 0;

        /* prefix for the debug output below */
        str = "cpr_statefile_ok:";

        CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
            k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
        CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
            MAX(availrmem - swapfs_minfree, 0),
            k_anoninfo.ani_mem_resv);
        CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
            CURRENT_TOTAL_AVAILABLE_SWAP);

        /*
         * try increasing filesize by 15%
         */
        if (alloc_retry) {
                /*
                 * block device doesn't get any bigger
                 */
                if (vp->v_type == VBLK) {
                        if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                                prom_printf(
                                    "Retry statefile on special file\n");
                        return (ENOMEM);
                } else {
                        /* new target: current file size * 115 / 100 */
                        rw_enter(&ip->i_contents, RW_READER);
                        size = (ip->i_size * SIZE_RATE) / INTEGRAL;
                        rw_exit(&ip->i_contents);
                }
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("Retry statefile size = %lld\n", size);
        } else {
                u_longlong_t cpd_size;
                pgcnt_t npages, nback;
                int ndvram;

                /*
                 * The CB_CL_CPR_FB callbacks receive the address of
                 * ndvram; presumably they report the amount of video
                 * RAM to be saved -- TODO confirm against the callbacks.
                 */
                ndvram = 0;
                (void) callb_execute_class(CB_CL_CPR_FB,
                    (int)(uintptr_t)&ndvram);
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("ndvram size = %d\n", ndvram);

                /*
                 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
                 */
                npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
                cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
                /* raw_data is added to the estimate without any scaling */
                raw_data = cpd_size + cpr_bitmap_size;
                ksize = ndvram + mmu_ptob(npages);

                est_fmt = "%s estimated size with "
                    "%scompression %lld, ksize %lld\n";
                /* nback holds a byte count: cs_nosw_pages run through mmu_ptob() */
                nback = mmu_ptob(STAT->cs_nosw_pages);
                if (CPR->c_flags & C_COMPRESSING) {
                        /*
                         * ksize scaled to COMPRESS_PERCENT, plus the
                         * unscaled data, plus nback * 10 / UCOMP_RATE.
                         */
                        size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
                            raw_data + ((nback * 10) / UCOMP_RATE);
                        CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
                } else {
                        size = ksize + raw_data + nback;
                        CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
                            size, ksize);
                }
        }

        /*
         * All this is much simpler for a block device
         */
        if (vp->v_type == VBLK) {
                space = cpr_get_devsize(vp->v_rdev);
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("statefile dev size %lu\n", space);

                /*
                 * Export the estimated filesize info, this value will be
                 * compared before dumping out the statefile in the case of
                 * no compression.
                 */
                STAT->cs_est_statefsz = size;
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
                        prom_printf("%s Estimated statefile size %llu, "
                            "space %lu\n", str, size, space);
                if (size > space) {
                        cpr_err(CE_CONT, "Statefile partition too small.");
                        return (ENOMEM);
                }
                return (0);
        } else {
                /* c_alloc_cnt is bumped on every pass through here */
                if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
                        cpr_err(CE_CONT, "Statefile allocation retry failed\n");
                        return (ENOMEM);
                }

                /*
                 * Estimate space needed for the state file.
                 *
                 * State file size in bytes:
                 *      kernel size + non-cache pte seg +
                 *      bitmap size + cpr state file headers size
                 * (round up to fs->fs_bsize)
                 */
                size = blkroundup(ip->i_fs, size);

                /*
                 * Export the estimated filesize info, this value will be
                 * compared before dumping out the statefile in the case of
                 * no compression.
                 */
                STAT->cs_est_statefsz = size;
                error = cpr_grow_statefile(vp, size);
                if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
                        /* re-read i_size to show the size after growing */
                        rw_enter(&ip->i_contents, RW_READER);
                        isize = ip->i_size;
                        rw_exit(&ip->i_contents);
                        prom_printf("%s Estimated statefile size %lld, "
                            "i_size %lld\n", str, size, isize);
                }

                return (error);
        }
}
 730 
 731 
 732 void
 733 cpr_statef_close(void)
 734 {
 735         if (C_VP) {
 736                 if (!cpr_reusable_mode)
 737                         (void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
 738                 (void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
 739                 VN_RELE(C_VP);
 740                 C_VP = 0;
 741         }
 742 }
 743 
 744 
 745 /*
 746  * open cpr default file and display error
 747  */
 748 int
 749 cpr_open_deffile(int mode, vnode_t **vpp)
 750 {
 751         int error;
 752 
 753         if (error = cpr_open(cpr_default_path, mode, vpp))
 754                 cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
 755                     cpr_default_path, error);
 756         return (error);
 757 }
 758 
 759 
 760 /*
 761  * write cdef_t to disk.  This contains the original values of prom
 762  * properties that we modify.  We fill in the magic number of the file
 763  * here as a signal to the booter code that the state file is valid.
 764  * Be sure the file gets synced, since we may be shutting down the OS.
 765  */
 766 int
 767 cpr_write_deffile(cdef_t *cdef)
 768 {
 769         struct vnode *vp;
 770         char *str;
 771         int rc;
 772 
 773         if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
 774                 return (rc);
 775 
 776         if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
 777                 str = "write";
 778         else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
 779                 str = "fsync";
 780         (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
 781         VN_RELE(vp);
 782 
 783         if (rc) {
 784                 cpr_err(CE_WARN, "%s error %d, file \"%s\"",
 785                     str, rc, cpr_default_path);
 786         }
 787         return (rc);
 788 }
 789 
 790 /*
 791  * Clear the magic number in the defaults file.  This tells the booter
 792  * program that the state file is not current and thus prevents
 793  * any attempt to restore from an obsolete state file.
 794  */
 795 void
 796 cpr_clear_definfo(void)
 797 {
 798         struct vnode *vp;
 799         cmini_t mini;
 800 
 801         if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
 802             cpr_open_deffile(FCREAT|FWRITE, &vp))
 803                 return;
 804         mini.magic = mini.reusable = 0;
 805         (void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
 806         (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
 807         VN_RELE(vp);
 808 }
 809 
 810 /*
 811  * If the cpr default file is invalid, then we must not be in reusable mode
 812  * if it is valid, it tells us our mode
 813  */
 814 int
 815 cpr_get_reusable_mode(void)
 816 {
 817         struct vnode *vp;
 818         cmini_t mini;
 819         int rc;
 820 
 821         if (cpr_open(cpr_default_path, FREAD, &vp))
 822                 return (0);
 823 
 824         rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
 825         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 826         VN_RELE(vp);
 827         if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
 828                 return (mini.reusable);
 829 
 830         return (0);
 831 }
 832 #endif
 833 
/*
 * clock/time related routines
 */
/*
 * Seconds value recorded by cpr_save_time() and later handed to clkset()
 * by cpr_restore_time().
 */
static time_t   cpr_time_stamp;
 838 
 839 
 840 void
 841 cpr_tod_get(cpr_time_t *ctp)
 842 {
 843         timestruc_t ts;
 844 
 845         mutex_enter(&tod_lock);
 846         ts = TODOP_GET(tod_ops);
 847         mutex_exit(&tod_lock);
 848         ctp->tv_sec = (time32_t)ts.tv_sec;
 849         ctp->tv_nsec = (int32_t)ts.tv_nsec;
 850 }
 851 
 852 void
 853 cpr_tod_status_set(int tod_flag)
 854 {
 855         mutex_enter(&tod_lock);
 856         tod_status_set(tod_flag);
 857         mutex_exit(&tod_lock);
 858 }
 859 
 860 void
 861 cpr_save_time(void)
 862 {
 863         cpr_time_stamp = gethrestime_sec();
 864 }
 865 
 866 /*
 867  * correct time based on saved time stamp or hardware clock
 868  */
 869 void
 870 cpr_restore_time(void)
 871 {
 872         clkset(cpr_time_stamp);
 873 }
 874 
 875 #if defined(__sparc)
 876 /*
 877  * CPU ONLINE/OFFLINE CODE
 878  */
 879 int
 880 cpr_mp_offline(void)
 881 {
 882         cpu_t *cp, *bootcpu;
 883         int rc = 0;
 884         int brought_up_boot = 0;
 885 
 886         /*
 887          * Do nothing for UP.
 888          */
 889         if (ncpus == 1)
 890                 return (0);
 891 
 892         mutex_enter(&cpu_lock);
 893 
 894         cpr_save_mp_state();
 895 
 896         bootcpu = i_cpr_bootcpu();
 897         if (!CPU_ACTIVE(bootcpu)) {
 898                 if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
 899                         mutex_exit(&cpu_lock);
 900                         return (rc);
 901                 }
 902                 brought_up_boot = 1;
 903         }
 904 
 905         cp = cpu_list;
 906         do {
 907                 if (cp == bootcpu)
 908                         continue;
 909                 if (cp->cpu_flags & CPU_OFFLINE)
 910                         continue;
 911                 if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
 912                         mutex_exit(&cpu_lock);
 913                         return (rc);
 914                 }
 915         } while ((cp = cp->cpu_next) != cpu_list);
 916         if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
 917                 prom_printf("changed cpu %p to state %d\n",
 918                     (void *)bootcpu, CPU_CPR_ONLINE);
 919         mutex_exit(&cpu_lock);
 920 
 921         return (rc);
 922 }
 923 
 924 int
 925 cpr_mp_online(void)
 926 {
 927         cpu_t *cp, *bootcpu = CPU;
 928         int rc = 0;
 929 
 930         /*
 931          * Do nothing for UP.
 932          */
 933         if (ncpus == 1)
 934                 return (0);
 935 
 936         /*
 937          * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
 938          * to indicate a cpu was online at the time of cpr_suspend();
 939          * now restart those cpus that were marked as CPU_CPR_ONLINE
 940          * and actually are offline.
 941          */
 942         mutex_enter(&cpu_lock);
 943         for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
 944                 /*
 945                  * Clear the CPU_FROZEN flag in all cases.
 946                  */
 947                 cp->cpu_flags &= ~CPU_FROZEN;
 948 
 949                 if (CPU_CPR_IS_OFFLINE(cp))
 950                         continue;
 951                 if (CPU_ACTIVE(cp))
 952                         continue;
 953                 if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
 954                         mutex_exit(&cpu_lock);
 955                         return (rc);
 956                 }
 957         }
 958 
 959         /*
 960          * turn off the boot cpu if it was offlined
 961          */
 962         if (CPU_CPR_IS_OFFLINE(bootcpu)) {
 963                 if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
 964                         mutex_exit(&cpu_lock);
 965                         return (rc);
 966                 }
 967         }
 968         mutex_exit(&cpu_lock);
 969         return (0);
 970 }
 971 
 972 static void
 973 cpr_save_mp_state(void)
 974 {
 975         cpu_t *cp;
 976 
 977         ASSERT(MUTEX_HELD(&cpu_lock));
 978 
 979         cp = cpu_list;
 980         do {
 981                 cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
 982                 if (CPU_ACTIVE(cp))
 983                         CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
 984         } while ((cp = cp->cpu_next) != cpu_list);
 985 }
 986 
 987 /*
 988  * change cpu to online/offline
 989  */
 990 static int
 991 cpr_p_online(cpu_t *cp, int state)
 992 {
 993         int rc;
 994 
 995         ASSERT(MUTEX_HELD(&cpu_lock));
 996 
 997         switch (state) {
 998         case CPU_CPR_ONLINE:
 999                 rc = cpu_online(cp);
1000                 break;
1001         case CPU_CPR_OFFLINE:
1002                 rc = cpu_offline(cp, CPU_FORCED);
1003                 break;
1004         }
1005         if (rc) {
1006                 cpr_err(CE_WARN, "Failed to change processor %d to "
1007                     "state %d, (errno %d)", cp->cpu_id, state, rc);
1008         }
1009         return (rc);
1010 }
1011 
1012 /*
1013  * Construct the pathname of the state file and return a pointer to
1014  * caller.  Read the config file to get the mount point of the
1015  * filesystem and the pathname within fs.
1016  */
1017 char *
1018 cpr_build_statefile_path(void)
1019 {
1020         struct cprconfig *cf = &cprconfig;
1021 
1022         if (cpr_get_config())
1023                 return (NULL);
1024 
1025         switch (cf->cf_type) {
1026         case CFT_UFS:
1027                 if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
1028                         cpr_err(CE_CONT, "Statefile path is too long.\n");
1029                         return (NULL);
1030                 }
1031                 return (cpr_cprconfig_to_path());
1032         case CFT_ZVOL:
1033                 /*FALLTHROUGH*/
1034         case CFT_SPEC:
1035                 return (cf->cf_devfs);
1036         default:
1037                 cpr_err(CE_PANIC, "invalid statefile type");
1038                 /*NOTREACHED*/
1039                 return (NULL);
1040         }
1041 }
1042 
1043 int
1044 cpr_statefile_is_spec(void)
1045 {
1046         if (cpr_get_config())
1047                 return (0);
1048         return (cprconfig.cf_type == CFT_SPEC);
1049 }
1050 
1051 char *
1052 cpr_get_statefile_prom_path(void)
1053 {
1054         struct cprconfig *cf = &cprconfig;
1055 
1056         ASSERT(cprconfig_loaded);
1057         ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
1058         ASSERT(cf->cf_type == CFT_SPEC || cf->cf_type == CFT_ZVOL);
1059         return (cf->cf_dev_prom);
1060 }
1061 
1062 
1063 /*
1064  * XXX The following routines need to be in the vfs source code.
1065  */
1066 
1067 int
1068 cpr_is_ufs(struct vfs *vfsp)
1069 {
1070         char *fsname;
1071 
1072         fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1073         return (strcmp(fsname, "ufs") == 0);
1074 }
1075 
1076 int
1077 cpr_is_zfs(struct vfs *vfsp)
1078 {
1079         char *fsname;
1080 
1081         fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1082         return (strcmp(fsname, "zfs") == 0);
1083 }
1084 
1085 /*
1086  * This is a list of file systems that are allowed to be writeable when a
1087  * reusable statefile checkpoint is taken.  They must not have any state that
1088  * cannot be restored to consistency by simply rebooting using the checkpoint.
1089  * (In contrast to ufs and pcfs which have disk state that could get
1090  * out of sync with the in-kernel data).
1091  */
1092 int
1093 cpr_reusable_mount_check(void)
1094 {
1095         struct vfs *vfsp;
1096         char *fsname;
1097         char **cpp;
1098         static char *cpr_writeok_fss[] = {
1099                 "autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
1100                 "proc", "tmpfs", "ctfs", "objfs", "dev", NULL
1101         };
1102 
1103         vfs_list_read_lock();
1104         vfsp = rootvfs;
1105         do {
1106                 if (vfsp->vfs_flag & VFS_RDONLY) {
1107                         vfsp = vfsp->vfs_next;
1108                         continue;
1109                 }
1110                 fsname = vfssw[vfsp->vfs_fstype].vsw_name;
1111                 for (cpp = cpr_writeok_fss; *cpp; cpp++) {
1112                         if (strcmp(fsname, *cpp) == 0)
1113                                 break;
1114                 }
1115                 /*
1116                  * if the inner loop reached the NULL terminator,
1117                  * the current fs-type does not match any OK-type
1118                  */
1119                 if (*cpp == NULL) {
1120                         cpr_err(CE_CONT, "a filesystem of type %s is "
1121                             "mounted read/write.\nReusable statefile requires "
1122                             "no writeable filesystem of this type be mounted\n",
1123                             fsname);
1124                         vfs_list_unlock();
1125                         return (EINVAL);
1126                 }
1127                 vfsp = vfsp->vfs_next;
1128         } while (vfsp != rootvfs);
1129         vfs_list_unlock();
1130         return (0);
1131 }
1132 
1133 /*
1134  * return statefile offset in DEV_BSIZE units
1135  */
1136 int
1137 cpr_statefile_offset(void)
1138 {
1139         return (cprconfig.cf_type != CFT_UFS ? btod(CPR_SPEC_OFFSET) : 0);
1140 }
1141 
1142 /*
1143  * Force a fresh read of the cprinfo per uadmin 3 call
1144  */
1145 void
1146 cpr_forget_cprconfig(void)
1147 {
1148         cprconfig_loaded = 0;
1149 }
1150 #endif