/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 PALO, Richard.
 */

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>

/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this.  We will then be signalled when the
 * reservation changes.  If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
 */

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back.  Removal should
 * always be from the front.  This is a singly-linked list using
 * p_next, so p_prev is always NULL.
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when called many times in a row.  If we're reassigning fewer
 * pages than the quota defined here, we just accept the slowdown.  If the
 * count is greater than the quota, we tell the contig alloc code to stop
 * its accounting until we're done.  Setting the quota to less than 2 is
 * not supported.
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
        if (count > bln_contig_list_quota) {
                clear_and_lock_contig_pfnlist();
                return (1);
        } else {
                return (0);
        }
}

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
        /*
         * We need to keep the page exclusively locked
         * to prevent swrand from grabbing it.
         */
        ASSERT(PAGE_EXCL(pp));
        ASSERT(MUTEX_HELD(&bln_mutex));

        pp->p_prev = NULL;
        if (bln_spare_list_front == NULL) {
                bln_spare_list_front = bln_spare_list_back = pp;
                pp->p_next = NULL;
        } else if (pp->p_pagenum >= mfn_count) {
                /*
                 * The pfn is invalid, so add at the end of list.  Since these
                 * adds should *only* be done by balloon_init_new_pages(), and
                 * that does adds in order, the following ASSERT should
                 * never trigger.
                 */
                ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
                bln_spare_list_back->p_next = pp;
                pp->p_next = NULL;
                bln_spare_list_back = pp;
        } else {
                /* Add at beginning of list */
                pp->p_next = bln_spare_list_front;
                bln_spare_list_front = pp;
        }
}

/*
 * Return a page_t structure from our spare list, or NULL if none are available.
 */
static page_t *
balloon_page_sub(void)
{
        page_t *pp;

        ASSERT(MUTEX_HELD(&bln_mutex));
        if (bln_spare_list_front == NULL) {
                return (NULL);
        }

        pp = bln_spare_list_front;
        ASSERT(PAGE_EXCL(pp));
        ASSERT(pp->p_pagenum <= mfn_count);
        if (pp->p_pagenum == mfn_count) {
                return (NULL);
        }

        bln_spare_list_front = pp->p_next;
        if (bln_spare_list_front == NULL)
                bln_spare_list_back = NULL;
        pp->p_next = NULL;
        return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called.  It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
        struct memseg   memseg;
        struct memlist  memlist;
        page_t          pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here.  page_t's are handled separately, so they are not included.
 */
#define MEM_STRUCT_SIZE (sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
        pgcnt_t metapgs, totalpgs, num_pages;
        paddr_t metasz;
        pfn_t   meta_start;
        page_t  *page_array;
        caddr_t va;
        int     i, rv, locked;
        mem_structs_t *mem;
        struct memseg *segp;

        /* Calculate the number of pages we're going to add */
        totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

        /*
         * The following calculates the number of "meta" pages -- the pages
         * that will be required to hold page_t structures for all new pages.
         * A sketch of the calculation is given in the comment below.
         */
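        /*
         * Sketch: split totalpgs into d data pages and m meta pages such
         * that the meta pages can hold the page_t structures for the data
         * pages, i.e. m * PAGESIZE >= d * sizeof (page_t) with
         * d + m == totalpgs.  Solving the equality gives
         * d = (totalpgs * PAGESIZE) / (PAGESIZE + sizeof (page_t)) and
         * m = totalpgs - d, which is the expression below.  The check that
         * follows adds one more meta page when the memseg and memlist
         * headers would not otherwise fit.
         */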
        metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
            (PAGESIZE + sizeof (page_t)));

        /*
         * Given the number of page_t structures we need, is there also
         * room in our meta pages for a memseg and memlist struct?
         * If not, we'll need one more meta page.
         */
        if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
            MEM_STRUCT_SIZE))
                metapgs++;

        /*
         * metapgs is calculated from totalpgs, which may be much larger than
         * count.  If we don't have enough pages, all of the pages in this
         * batch will be made meta pages, and a future trip through
         * balloon_inc_reservation() will add the rest of the meta pages.
         */
        if (metapgs > count)
                metapgs = count;

        /*
         * Figure out the number of page_t structures that can fit in metapgs
         *
         * This will cause us to initialize more page_t structures than we
         * need - these may be used in future memory increases.
         */
        metasz = pfn_to_pa(metapgs);
        num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

        DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
            num_pages, pgcnt_t, metapgs);

        /*
         * We only increment mfn_count by count, not num_pages, to keep the
         * space of all valid pfns contiguous.  This means we create page_t
         * structures with invalid pagenums -- we deal with this situation
         * in balloon_page_sub.
         */
        mfn_count += count;

        /*
         * Get a VA for the pages that will hold page_t and other structures.
         * The memseg and memlist structures will go at the beginning, with
         * the page_t structures following.
         */
        va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
        /* LINTED: improper alignment */
        mem = (mem_structs_t *)va;
        page_array = mem->pages;

        meta_start = bln_stats.bln_max_pages;

        /*
         * Set the mfn to pfn mapping for the meta pages.
         */
        locked = balloon_lock_contig_pfnlist(metapgs);
        for (i = 0; i < metapgs; i++) {
                reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
        }
        if (locked)
                unlock_contig_pfnlist();

        /*
         * For our meta pages, map them in and zero the page.
         * This will be the first time touching the new pages.
         */
        hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
            PROT_READ | PROT_WRITE,
            HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
        bzero(va, metasz);

        /*
         * Initialize the page array for the new pages.
         */
        for (i = 0; i < metapgs; i++) {
                page_array[i].p_pagenum = bln_stats.bln_max_pages++;
                page_array[i].p_offset = (u_offset_t)-1;
                page_iolock_init(&page_array[i]);
                rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
                ASSERT(rv == 1);
        }

        /*
         * For the rest of the pages, initialize the page_t struct and
         * add them to the free list
         */
        for (i = metapgs; i < num_pages; i++) {
                page_array[i].p_pagenum = bln_stats.bln_max_pages++;
                page_array[i].p_offset = (u_offset_t)-1;
                page_iolock_init(&page_array[i]);
                rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
                ASSERT(rv == 1);
                balloon_page_add(&page_array[i]);
        }

        /*
         * Remember where I said that we don't call this function?  The missing
         * code right here is why.  We need to set up kpm mappings for any new
         * pages coming in.  However, if someone starts up a domain with small
         * memory, then greatly increases it, we could get in some horrible
         * deadlock situations as we steal page tables for kpm use, and
         * userland applications take them right back before we can use them
         * to set up our new memory.  Once a way around that is found, and a
         * few other changes are made, we'll be able to enable this code.
         */

        /*
         * Update kernel structures, part 1: memsegs list
         */
        mem->memseg.pages_base = meta_start;
        mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
        mem->memseg.pages = &page_array[0];
        mem->memseg.epages = &page_array[num_pages - 1];
        mem->memseg.next = NULL;
        memsegs_lock(1);
        for (segp = memsegs; segp->next != NULL; segp = segp->next)
                ;
        segp->next = &mem->memseg;
        memsegs_unlock(1);

        /*
         * Update kernel structures, part 2: mem_node array
         */
        mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

        /*
         * Update kernel structures, part 3: phys_install array
         * (*sigh* how many of these things do we need?)
         */
        memlist_write_lock();
        memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
            &phys_install);
        memlist_write_unlock();

        build_pfn_hash();

        return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
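/* (e.g. 4096 / 8 == 512 entries, assuming 4K pages and an 8-byte ulong_t) */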
#define FRAME_ARRAY_SIZE        (PAGESIZE / sizeof (ulong_t))

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t  mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t    pfn_frames[FRAME_ARRAY_SIZE];

/*
 * This function is called when our reservation is increasing.  Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
        int     i, cnt, locked;
        int     meta_pg_start, meta_pg_end;
        long    rv;
        page_t  *pp;
        page_t  *new_list_front, *new_list_back;

        /* Make sure we're single-threaded. */
        ASSERT(MUTEX_HELD(&bln_mutex));

        rv = 0;
        new_list_front = new_list_back = NULL;
        meta_pg_start = meta_pg_end = 0;
        bzero(mfn_frames, PAGESIZE);

        if (credit > FRAME_ARRAY_SIZE)
                credit = FRAME_ARRAY_SIZE;

        xen_block_migrate();
        rv = balloon_alloc_pages(credit, mfn_frames);

        if (rv < 0) {
                xen_allow_migrate();
                return (0);
        }
        for (i = 0; i < rv; i++) {
                if (mfn_frames[i] > new_high_mfn)
                        new_high_mfn = mfn_frames[i];

                pp = balloon_page_sub();
                if (pp == NULL) {
                        /*
                         * We pass the index into the current mfn array,
                         * then move the counter past the mfns we used
                         */
                        meta_pg_start = i;
                        cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
                        i += cnt;
                        meta_pg_end = i;
                        if (i < rv) {
                                pp = balloon_page_sub();
                        } else {
                                ASSERT(i == rv);
                        }
                }
                if (pp == NULL) {
                        break;
                }

                if (new_list_back == NULL) {
                        new_list_front = new_list_back = pp;
                } else {
                        new_list_back->p_next = pp;
                        new_list_back = pp;
                }
                pp->p_next = NULL;
        }
        cnt = i;
        locked = balloon_lock_contig_pfnlist(cnt);
        for (i = 0, pp = new_list_front; i < meta_pg_start;
            i++, pp = pp->p_next) {
                reassign_pfn(pp->p_pagenum, mfn_frames[i]);
        }
        for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
                reassign_pfn(pp->p_pagenum, mfn_frames[i]);
        }
        if (locked)
                unlock_contig_pfnlist();

        /*
         * Make sure we don't allow pages without pfn->mfn mappings
         * into the system.
         */
        ASSERT(pp == NULL);

        while (new_list_front != NULL) {
                pp = new_list_front;
                new_list_front = pp->p_next;
                page_free(pp, 1);
        }

        /*
         * Variable review: at this point, rv contains the number of pages
         * the hypervisor gave us.  cnt contains the number of pages for which
         * we had page_t structures.  i contains the number of pages for
         * which we set up pfn <-> mfn mappings.  If this ASSERT trips, that
         * means we somehow lost page_t's from our local list.
         */
        ASSERT(cnt == i);
        if (cnt < rv) {
                /*
                 * We couldn't get page structures.
                 *
                 * This shouldn't happen, but causes no real harm if it does.
                 * On debug kernels, we'll flag it.  On all kernels, we'll
                 * give back the pages we couldn't assign.
                 *
                 * Since these pages are new to the system and haven't been
                 * used, we don't bother zeroing them.
                 */
#ifdef DEBUG
                cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif  /* DEBUG */

                (void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);

                rv = cnt;
        }

        xen_allow_migrate();
        page_unresv(rv - (meta_pg_end - meta_pg_start));
        return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain.  Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
        int     i, locked;
        long    rv;
        ulong_t request;
        page_t  *pp;

        bzero(mfn_frames, sizeof (mfn_frames));
        bzero(pfn_frames, sizeof (pfn_frames));

        if (debit > FRAME_ARRAY_SIZE) {
                debit = FRAME_ARRAY_SIZE;
        }
        request = debit;

        /*
         * Don't bother if there isn't a safe amount of kmem left.
         */
        if (kmem_avail() < balloon_minkmem) {
                kmem_reap();
                if (kmem_avail() < balloon_minkmem)
                        return (0);
        }

        if (page_resv(request, KM_NOSLEEP) == 0) {
                return (0);
        }
        xen_block_migrate();
        for (i = 0; i < debit; i++) {
                pp = page_get_high_mfn(new_high_mfn);
                new_high_mfn = 0;
                if (pp == NULL) {
                        /*
                         * Call kmem_reap(), then try once more,
                         * but only if there is a safe amount of
                         * kmem left.
                         */
                        kmem_reap();
                        if (kmem_avail() < balloon_minkmem ||
                            (pp = page_get_high_mfn(0)) == NULL) {
                                debit = i;
                                break;
                        }
                }
                ASSERT(PAGE_EXCL(pp));
                ASSERT(!hat_page_is_mapped(pp));

                balloon_page_add(pp);
                pfn_frames[i] = pp->p_pagenum;
                mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
        }
        if (debit == 0) {
                xen_allow_migrate();
                page_unresv(request);
                return (0);
        }

        /*
         * We zero all the pages before we start reassigning them in order to
         * minimize the time spent holding the lock on the contig pfn list.
         */
        if (balloon_zero_memory) {
                for (i = 0; i < debit; i++) {
                        pfnzero(pfn_frames[i], 0, PAGESIZE);
                }
        }

        /*
         * Remove all mappings for the pfns from the system
         */
        locked = balloon_lock_contig_pfnlist(debit);
        for (i = 0; i < debit; i++) {
                reassign_pfn(pfn_frames[i], MFN_INVALID);
        }
        if (locked)
                unlock_contig_pfnlist();

        rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

        if (rv < 0) {
                cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
                    "failed - up to %lu pages lost (error = %ld)", debit, rv);
                rv = 0;
        } else if (rv != debit) {
                panic("Unexpected return value (%ld) from decrease reservation "
                    "hypervisor call", rv);
        }

        xen_allow_migrate();
        if (debit != request)
                page_unresv(request - debit);
        return (rv);
}

/*
 * This function is the callback which is called when the memory/target
 * node is changed.  When it is fired, we will read a new reservation
 * target for our domain and signal the worker thread to make the change.
 *
 * If the reservation is larger than we can handle, we issue a warning.  dom0
 * does this automatically every boot, so we skip the first warning on dom0.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
        ulong_t new_target_kb;
        pgcnt_t new_target_pages;
        int rv;
        static uchar_t warning_cnt = 0;

        rv = xenbus_scanf(0, "memory", "target", "%lu", &new_target_kb);
        if (rv != 0) {
                return;
        }

        /* new_target is in kB - change this to pages */
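        /* e.g. with 4K pages, a target of 1048576 kB becomes 262144 pages */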
        new_target_pages = kbtop(new_target_kb);

        DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

        /*
         * Unfortunately, dom0 may give us a target that is larger than
         * our max limit.  Re-check the limit, and, if the new target is
         * too large, adjust it downwards.
         */
        mutex_enter(&bln_mutex);
        if (new_target_pages > bln_stats.bln_max_pages) {
                DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
                    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
                if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
                        cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
                            "larger than original memory size (0x%lx pages). "
                            "Ballooning beyond original memory size is not "
                            "allowed.",
                            new_target_pages, bln_stats.bln_max_pages);
                }
                warning_cnt = 1;
                bln_stats.bln_new_target = bln_stats.bln_max_pages;
        } else {
                bln_stats.bln_new_target = new_target_pages;
        }

        mutex_exit(&bln_mutex);
        cv_signal(&bln_cv);
}

/*
 * bln_wait_sec can be used to throttle the hv calls, but by default it's
 * turned off.  If a balloon attempt fails, the wait time is forced on, and
 * then is exponentially increased as further attempts fail.
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;
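
/*
 * For example, with the defaults above (bln_wait_sec == 0, bln_wait_shift
 * == 1), the first failed attempt forces a one second wait, and each further
 * failure doubles it: 1, 2, 4, 8, ... seconds, until a request succeeds and
 * the wait is reset to bln_wait_sec.
 */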

/*
 * This is the main balloon thread.  Wait on the cv.  When woken, if our
 * reservation has changed, call the appropriate function to adjust the
 * reservation.
 */
static void
balloon_worker_thread(void)
{
        uint_t          bln_wait;
        callb_cpr_t     cprinfo;
        spgcnt_t        rv;

        bln_wait = bln_wait_sec;

        CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
        for (;;) {
                rv = 0;

                mutex_enter(&bln_mutex);
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
                        /*
                         * We weren't able to fully complete the request
                         * last time through, so try again.
                         */
                        (void) cv_reltimedwait(&bln_cv, &bln_mutex,
                            (bln_wait * hz), TR_CLOCK_TICK);
                } else {
                        cv_wait(&bln_cv, &bln_mutex);
                }
                CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

                if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
                        if (bln_stats.bln_new_target <
                            bln_stats.bln_current_pages) {
                                /* reservation shrunk */
                                rv = -balloon_dec_reservation(
                                    bln_stats.bln_current_pages -
                                    bln_stats.bln_new_target);
                        } else if (bln_stats.bln_new_target >
                            bln_stats.bln_current_pages) {
                                /* reservation grew */
                                rv = balloon_inc_reservation(
                                    bln_stats.bln_new_target -
                                    bln_stats.bln_current_pages);
                        }
                }
                if (rv == 0) {
                        if (bln_wait == 0) {
                                bln_wait = 1;
                        } else {
                                bln_wait <<= bln_wait_shift;
                        }
                } else {
                        bln_stats.bln_current_pages += rv;
                        bln_wait = bln_wait_sec;
                }
                if (bln_stats.bln_current_pages < bln_stats.bln_low)
                        bln_stats.bln_low = bln_stats.bln_current_pages;
                else if (bln_stats.bln_current_pages > bln_stats.bln_high)
                        bln_stats.bln_high = bln_stats.bln_current_pages;
                mutex_exit(&bln_mutex);
        }
}

/*
 * Called after balloon_init(), which is below.  The xenbus thread is up
 * and running, so we can register our watch and create the balloon thread.
 */
static void
balloon_config_watch(int state)
{
        if (state != XENSTORE_UP)
                return;

        bln_watch.node = "memory/target";
        bln_watch.callback = balloon_handler;
        if (register_xenbus_watch(&bln_watch)) {
                cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
                    "thread will be disabled");
                return;
        }

        if (bln_thread == NULL)
                bln_thread = thread_create(NULL, 0, balloon_worker_thread,
                    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Basic initialization of the balloon thread.  Set all of our variables,
 * and register a callback for later when we can register a xenbus watch.
 */
void
balloon_init(pgcnt_t nr_pages)
{
        domid_t domid = DOMID_SELF;

        bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
        bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
        bln_stats.bln_max_pages = nr_pages;
        cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

        bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
            XENMEM_maximum_reservation, &domid);

        (void) xs_register_xenbus_callback(balloon_config_watch);
}

/*
 * These functions are called from the network drivers when they gain a page
 * or give one away.  We simply update our count.  Note that the counter
 * tracks the number of pages we give away, so we need to subtract any
 * amount passed to balloon_drv_added.
 */
void
balloon_drv_added(int64_t delta)
{
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}

void
balloon_drv_subtracted(int64_t delta)
{
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}

/*
 * balloon_alloc_pages()
 *      Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
 *      the number of pages allocated, which could be less than page_cnt, or
 *      a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
        xen_memory_reservation_t memres;
        long rv;

        bzero(&memres, sizeof (memres));
        /*LINTED: constant in conditional context*/
        set_xen_guest_handle(memres.extent_start, mfns);
        memres.domid = DOMID_SELF;
        memres.nr_extents = page_cnt;

        rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
        if (rv > 0)
                atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
        return (rv);
}

/*
 * balloon_free_pages()
 *    free page_cnt pages, using any combination of mfns, pfns, and kva as long
 *    as they refer to the same mapping.  If an array of mfns is passed in, we
 *    assume they were already cleared.  Otherwise, we need to zero the pages
 *    before giving them back to the hypervisor. kva space is not free'd up in
 *    case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
        xen_memory_reservation_t memdec;
        mfn_t mfn;
        pfn_t pfn;
        uint_t i;
        long e;


#ifdef DEBUG
        /* make sure kva is page aligned and maps to first pfn */
        if (kva != NULL) {
                ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
                if (pfns != NULL) {
                        ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
                }
        }
#endif

        /* if we have a kva, we can clean all pages with just one bzero */
        if ((kva != NULL) && balloon_zero_memory) {
                bzero(kva, (page_cnt * PAGESIZE));
        }

        /* if we were given a kva and/or a pfn */
        if ((kva != NULL) || (pfns != NULL)) {

                /*
                 * All the current callers only pass 1 page when using kva or
                 * pfns, and use mfns when passing multiple pages.  If that
                 * assumption is changed, the following code will need some
                 * work.  The following ASSERT() guarantees we're respecting
                 * the io locking quota.
                 */
                ASSERT(page_cnt < bln_contig_list_quota);

                /* go through all the pages */
                for (i = 0; i < page_cnt; i++) {

                        /* get the next pfn */
                        if (pfns == NULL) {
                                pfn = hat_getpfnum(kas.a_hat,
                                    (kva + (PAGESIZE * i)));
                        } else {
                                pfn = pfns[i];
                        }

                        /*
                         * if we didn't already zero this page, do it now. we
                         * need to do this *before* we give back the MFN
                         */
                        if ((kva == NULL) && (balloon_zero_memory)) {
                                pfnzero(pfn, 0, PAGESIZE);
                        }

                        /*
                         * unmap the pfn. We don't free up the kva vmem space
                         * so the caller can re-use it. The page must be
                         * unmapped before it is given back to the hypervisor.
                         */
                        if (kva != NULL) {
                                hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
                                    PAGESIZE, HAT_UNLOAD_UNMAP);
                        }

                        /* grab the mfn before the pfn is marked as invalid */
                        mfn = pfn_to_mfn(pfn);

                        /* mark the pfn as invalid */
                        reassign_pfn(pfn, MFN_INVALID);

                        /*
                         * if we weren't given an array of MFNs, we need to
                         * free them up one at a time. Otherwise, we'll wait
                         * until later and do it in one hypercall
                         */
                        if (mfns == NULL) {
                                bzero(&memdec, sizeof (memdec));
                                /*LINTED: constant in conditional context*/
                                set_xen_guest_handle(memdec.extent_start, &mfn);
                                memdec.domid = DOMID_SELF;
                                memdec.nr_extents = 1;
                                e = HYPERVISOR_memory_op(
                                    XENMEM_decrease_reservation, &memdec);
                                if (e != 1) {
                                        cmn_err(CE_PANIC, "balloon: unable to "
                                            "give a page back to the "
                                            "hypervisor.\n");
                                }
                        }
                }
        }

        /*
         * if we were passed in MFNs, we haven't free'd them up yet. We can
         * do it with one call.
         */
        if (mfns != NULL) {
                bzero(&memdec, sizeof (memdec));
                /*LINTED: constant in conditional context*/
                set_xen_guest_handle(memdec.extent_start, mfns);
                memdec.domid = DOMID_SELF;
                memdec.nr_extents = page_cnt;
                e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
                if (e != page_cnt) {
                        cmn_err(CE_PANIC, "balloon: unable to give pages back "
                            "to the hypervisor.\n");
                }
        }

        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
        return (page_cnt);
}


/*
 * balloon_replace_pages()
 *      Try to replace nextents blocks of 2^order pages.  addr_bits specifies
 *      how many bits of address the pages must be within (e.g. 16 would mean
 *      that the pages cannot have an address > 64k).  The constraints are on
 *      what the hypervisor gives us -- we are free to give any pages in
 *      exchange.  The array pp is the pages we are giving away.  The caller
 *      provides storage space for mfns, which hold the new physical pages.
 */
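/*
 * For example, a caller that needs four single pages with machine addresses
 * below 4 GB could pass nextents = 4, order = 0 and addr_bits = 32.
 */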
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
        xen_memory_reservation_t memres;
        long fallback_cnt;
        long cnt;
        uint_t i, j, page_cnt, extlen;
        long e;
        int locked;


        /*
         * we shouldn't be allocating constrained pages on a guest. It doesn't
         * make any sense. They won't be constrained after a migration.
         */
        ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

        extlen = 1 << order;
        page_cnt = nextents * extlen;
        /* Give back the current pages to the hypervisor */
        for (i = 0; i < page_cnt; i++) {
                cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
                if (cnt != 1) {
                        cmn_err(CE_PANIC, "balloon: unable to give a page back "
                            "to the hypervisor.\n");
                }
        }

        /*
         * try to allocate the new pages using addr_bits and order. If we can't
         * get all of the pages, try to get the remaining pages with no
         * constraints and, if that was successful, return the number of
         * constrained pages we did allocate.
         */
        bzero(&memres, sizeof (memres));
        /*LINTED: constant in conditional context*/
        set_xen_guest_handle(memres.extent_start, mfns);
        memres.domid = DOMID_SELF;
        memres.nr_extents = nextents;
        memres.mem_flags = XENMEMF_address_bits(addr_bits);
        memres.extent_order = order;
        cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
        /* assign the new MFNs to the current PFNs */
        locked = balloon_lock_contig_pfnlist(cnt * extlen);
        for (i = 0; i < cnt; i++) {
                for (j = 0; j < extlen; j++) {
                        reassign_pfn(pp[i * extlen + j]->p_pagenum,
                            mfns[i] + j);
                }
        }
        if (locked)
                unlock_contig_pfnlist();
        if (cnt != nextents) {
                if (cnt < 0) {
                        cnt = 0;
                }

                /*
                 * We couldn't get enough memory to satisfy our requirements.
                 * The above loop will assign the parts of the request that
                 * were successful (this part may be 0).  We need to fill
                 * in the rest.  The bzero below clears out extent_order and
                 * address_bits, so we'll take anything from the hypervisor
                 * to replace the pages we gave away.
                 */
                fallback_cnt = page_cnt - cnt * extlen;
                bzero(&memres, sizeof (memres));
                /*LINTED: constant in conditional context*/
                set_xen_guest_handle(memres.extent_start, mfns);
                memres.domid = DOMID_SELF;
                memres.nr_extents = fallback_cnt;
                e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
                if (e != fallback_cnt) {
                        cmn_err(CE_PANIC, "balloon: unable to recover from "
                            "failed increase_reservation.\n");
                }
                locked = balloon_lock_contig_pfnlist(fallback_cnt);
                for (i = 0; i < fallback_cnt; i++) {
                        uint_t offset = page_cnt - fallback_cnt;

                        /*
                         * We already used pp[0...(cnt * extlen)] before,
                         * so start at the next entry in the pp array.
                         */
                        reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
                }
                if (locked)
                        unlock_contig_pfnlist();
        }

        /*
         * balloon_free_pages increments our counter.  Decrement it here.
         */
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

        /*
         * return the number of extents we were able to replace. If we got
         * this far, we know all the pp's are valid.
         */
        return (cnt);
}


/*
 * Called from the driver - return the requested stat.
 */
size_t
balloon_values(int cmd)
{
        switch (cmd) {
        case BLN_IOCTL_CURRENT:
                return (ptokb(bln_stats.bln_current_pages));
        case BLN_IOCTL_TARGET:
                return (ptokb(bln_stats.bln_new_target));
        case BLN_IOCTL_LOW:
                return (ptokb(bln_stats.bln_low));
        case BLN_IOCTL_HIGH:
                return (ptokb(bln_stats.bln_high));
        case BLN_IOCTL_LIMIT:
                return (ptokb(bln_stats.bln_hard_limit));
        default:
                panic("Unexpected cmd %d in balloon_values()\n", cmd);
        }
        /*NOTREACHED*/
}