1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/errno.h>
  30 #include <sys/kmem.h>
  31 #include <sys/vnode.h>
  32 #include <sys/vfs_opreg.h>
  33 #include <sys/swap.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/buf.h>
  36 #include <sys/callb.h>
  37 #include <sys/debug.h>
  38 #include <vm/seg.h>
  39 #include <sys/fs/swapnode.h>
  40 #include <fs/fs_subr.h>
  41 #include <sys/cmn_err.h>
  42 #include <sys/mem_config.h>
  43 #include <sys/atomic.h>
  44 
/*
 * Vnode operations template for swapfs.  It is defined outside this file;
 * swapinit() hands it to vn_make_ops() to build swap_vnodeops.
 */
extern const fs_operation_def_t swap_vnodeops_template[];
  46 
  47 /*
  48  * swapfs_minfree is the amount of physical memory (actually remaining
  49  * availrmem) that we want to keep free for the rest of the system.  This
  50  * means that swapfs can only grow to availrmem - swapfs_minfree.  This
 * can be set either to a constant value or to a certain percentage of
 * installed physical memory. It is set in swapinit().
  53  *
  54  * Users who want to change the amount of memory that can be used as swap
  55  * space should do so by setting swapfs_desfree at boot time,
  56  * not swapfs_minfree.
  57  */
  58 
/*
 * Tunables (pgcnt_t).  A value of zero means "not set": swapfs_recalc()
 * then substitutes a computed default before publishing the result here.
 */
pgcnt_t swapfs_desfree = 0;
pgcnt_t swapfs_minfree = 0;
pgcnt_t swapfs_reserve = 0;

#ifdef SWAPFS_DEBUG
/*
 * Debug control word, only present in SWAPFS_DEBUG builds.  It is not
 * referenced directly in this file; presumably consumed by SWAPFS_PRINT
 * (confirm against sys/fs/swapnode.h).
 */
int swapfs_debug;
#endif /* SWAPFS_DEBUG */


/* Number of vnodes created by swapfs_getvp(); bumped under swapfs_lock. */
static int swapfs_vpcount;
/*
 * Serializes creation of swap_vnodes[] entries and every manipulation of
 * the async request pending/free lists and sw_pending_size in this file.
 */
static kmutex_t swapfs_lock;
/*
 * sw_ar is the array of async request structures allocated by swapinit();
 * sw_pendlist and sw_freelist are the heads of the pending and free lists,
 * both singly linked through a_next.
 */
static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;

static struct vnode **swap_vnodes;      /* ptr's to swap vnodes */

static void swap_init_mem_config(void);

/*
 * Snapshot of the tunables taken by swapfs_recalc_save_initial().  Every
 * swapfs_recalc() starts from these values rather than from the current
 * (possibly already defaulted) globals.
 */
static pgcnt_t initial_swapfs_desfree;
static pgcnt_t initial_swapfs_minfree;
static pgcnt_t initial_swapfs_reserve;

static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
  81 
  82 static void
  83 swapfs_recalc_save_initial(void)
  84 {
  85         initial_swapfs_desfree = swapfs_desfree;
  86         initial_swapfs_minfree = swapfs_minfree;
  87         initial_swapfs_reserve = swapfs_reserve;
  88 }
  89 
  90 static int
  91 swapfs_recalc(pgcnt_t pgs)
  92 {
  93         pgcnt_t new_swapfs_desfree;
  94         pgcnt_t new_swapfs_minfree;
  95         pgcnt_t new_swapfs_reserve;
  96 
  97         new_swapfs_desfree = initial_swapfs_desfree;
  98         new_swapfs_minfree = initial_swapfs_minfree;
  99         new_swapfs_reserve = initial_swapfs_reserve;
 100 
 101         if (new_swapfs_desfree == 0)
 102                 new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
 103 
 104         if (new_swapfs_minfree == 0) {
 105                 /*
 106                  * We set this lower than we'd like here, 2Mb, because we
 107                  * always boot on swapfs. It's up to a safer value,
 108                  * swapfs_desfree, when/if we add physical swap devices
 109                  * in swapadd(). Users who want to change the amount of
 110                  * memory that can be used as swap space should do so by
 111                  * setting swapfs_desfree at boot time, not swapfs_minfree.
 112                  * However, swapfs_minfree is tunable by install as a
 113                  * workaround for bugid 1147463.
 114                  */
 115                 new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
 116         }
 117 
 118         /*
 119          * priv processes can reserve memory as swap as long as availrmem
 120          * remains greater than swapfs_minfree; in the case of non-priv
 121          * processes, memory can be reserved as swap only if availrmem
 122          * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
 123          * swapfs_reserve amount of memswap is not available to non-priv
 124          * processes. This protects daemons such as automounter dying
 125          * as a result of application processes eating away almost entire
 126          * membased swap. This safeguard becomes useless if apps are run
 127          * with root access.
 128          *
 129          * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
 130          * is greater up to the limit of 128 MB.
 131          */
 132         if (new_swapfs_reserve == 0)
 133                 new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
 134                     MAX(btopr(4 * 1024 * 1024), pgs >> 7));
 135 
 136         /* Test basic numeric viability. */
 137         if (new_swapfs_minfree > pgs)
 138                 return (0);
 139 
 140         /* Equivalent test to anon_resvmem() check. */
 141         if (availrmem < new_swapfs_minfree) {
 142                 /*
 143                  * If ism pages are being used, then there must be agreement
 144                  * between these two policies.
 145                  */
 146                 if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
 147                         new_swapfs_minfree = segspt_minfree;
 148                 } else {
 149                         return (0);
 150                 }
 151         }
 152 
 153         swapfs_desfree = new_swapfs_desfree;
 154         swapfs_minfree = new_swapfs_minfree;
 155         swapfs_reserve = new_swapfs_reserve;
 156 
 157         return (1);
 158 }
 159 
 160 /*ARGSUSED1*/
 161 int
 162 swapinit(int fstype, char *name)
 163 {                                                       /* reserve for mp */
 164         ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
 165         int i, error;
 166 
 167         static const fs_operation_def_t swap_vfsops[] = {
 168                 VFSNAME_SYNC, { .vfs_sync = swap_sync },
 169                 NULL, NULL
 170         };
 171 
 172         SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
 173         mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);
 174 
 175         swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
 176             KM_SLEEP);
 177 
 178         swapfs_recalc_save_initial();
 179         if (!swapfs_recalc(physmem))
 180                 cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
 181                     swapfs_minfree, physmem);
 182 
 183         /*
 184          * Arrange for a callback on memory size change.
 185          */
 186         swap_init_mem_config();
 187 
 188         sw_ar = (struct async_reqs *)
 189             kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);
 190 
 191         error = vfs_setfsops(fstype, swap_vfsops, NULL);
 192         if (error != 0) {
 193                 cmn_err(CE_WARN, "swapinit: bad vfs ops template");
 194                 return (error);
 195         }
 196 
 197         error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
 198         if (error != 0) {
 199                 (void) vfs_freevfsops_by_type(fstype);
 200                 cmn_err(CE_WARN, "swapinit: bad vnode ops template");
 201                 return (error);
 202         }
 203         sw_freelist = sw_ar;
 204         for (i = 0; i < sw_freelist_size - 1; i++)
 205                 sw_ar[i].a_next = &sw_ar[i + 1];
 206 
 207         return (0);
 208 }
 209 
 210 /*
 211  * Get a swapfs vnode corresponding to the specified identifier.
 212  */
 213 struct vnode *
 214 swapfs_getvp(ulong_t vidx)
 215 {
 216         struct vnode *vp;
 217 
 218         vp = swap_vnodes[vidx];
 219         if (vp) {
 220                 return (vp);
 221         }
 222 
 223         mutex_enter(&swapfs_lock);
 224         vp = swap_vnodes[vidx];
 225         if (vp == NULL) {
 226                 vp = vn_alloc(KM_SLEEP);
 227                 vn_setops(vp, swap_vnodeops);
 228                 vp->v_type = VREG;
 229                 vp->v_flag |= (VISSWAP|VISSWAPFS);
 230                 swap_vnodes[vidx] = vp;
 231                 swapfs_vpcount++;
 232         }
 233         mutex_exit(&swapfs_lock);
 234         return (vp);
 235 }
 236 
/*
 * NOTE(review): not referenced anywhere in the visible part of this file;
 * presumably used by other swapfs code -- confirm before removing.
 */
int swap_lo;
 238 
 239 /*ARGSUSED*/
 240 static int
 241 swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
 242 {
 243         struct vnode *vp;
 244         int i;
 245 
 246         if (!(flag & SYNC_ALL))
 247                 return (1);
 248 
 249         /*
 250          * assumes that we are the only one left to access this so that
 251          * no need to use swapfs_lock (since it's staticly defined)
 252          */
 253         for (i = 0; i < MAX_SWAP_VNODES; i++) {
 254                 vp = swap_vnodes[i];
 255                 if (vp) {
 256                         VN_HOLD(vp);
 257                         (void) VOP_PUTPAGE(vp, (offset_t)0, 0,
 258                             (B_ASYNC | B_FREE), kcred, NULL);
 259                         VN_RELE(vp);
 260                 }
 261         }
 262         return (0);
 263 }
 264 
/*
 * Defined outside this file.  The queue routines below adjust it by
 * PAGESIZE per request, under swapfs_lock, as requests enter and leave the
 * pending list.
 */
extern int sw_pending_size;
 266 
 267 /*
 268  * Take an async request off the pending queue
 269  */
 270 struct async_reqs *
 271 sw_getreq()
 272 {
 273         struct async_reqs *arg;
 274 
 275         mutex_enter(&swapfs_lock);
 276         arg = sw_pendlist;
 277         if (arg) {
 278                 sw_pendlist = arg->a_next;
 279                 arg->a_next = NULL;
 280                 sw_pending_size -= PAGESIZE;
 281         }
 282         ASSERT(sw_pending_size >= 0);
 283         mutex_exit(&swapfs_lock);
 284         return (arg);
 285 }
 286 
 287 /*
 288  * Put an async request on the pending queue
 289  */
 290 void
 291 sw_putreq(struct async_reqs *arg)
 292 {
 293         /* Hold onto it */
 294         VN_HOLD(arg->a_vp);
 295 
 296         mutex_enter(&swapfs_lock);
 297         arg->a_next = sw_pendlist;
 298         sw_pendlist = arg;
 299         sw_pending_size += PAGESIZE;
 300         mutex_exit(&swapfs_lock);
 301 }
 302 
 303 /*
 304  * Put an async request back on the pending queue
 305  */
 306 void
 307 sw_putbackreq(struct async_reqs *arg)
 308 {
 309         mutex_enter(&swapfs_lock);
 310         arg->a_next = sw_pendlist;
 311         sw_pendlist = arg;
 312         sw_pending_size += PAGESIZE;
 313         mutex_exit(&swapfs_lock);
 314 }
 315 
 316 /*
 317  * Take an async request structure off the free list
 318  */
 319 struct async_reqs *
 320 sw_getfree()
 321 {
 322         struct async_reqs *arg;
 323 
 324         mutex_enter(&swapfs_lock);
 325         arg = sw_freelist;
 326         if (arg) {
 327                 sw_freelist = arg->a_next;
 328                 arg->a_next = NULL;
 329         }
 330         mutex_exit(&swapfs_lock);
 331         return (arg);
 332 }
 333 
 334 /*
 335  * Put an async request structure on the free list
 336  */
 337 void
 338 sw_putfree(struct async_reqs *arg)
 339 {
 340         /* Release our hold - should have locked the page by now */
 341         VN_RELE(arg->a_vp);
 342 
 343         mutex_enter(&swapfs_lock);
 344         arg->a_next = sw_freelist;
 345         sw_freelist = arg;
 346         mutex_exit(&swapfs_lock);
 347 }
 348 
/*
 * Running total of the deltas announced by pre-delete callbacks and not yet
 * retired by the matching post-delete callbacks.  It is subtracted from
 * physmem whenever the callbacks re-run swapfs_recalc().
 */
static pgcnt_t swapfs_pending_delete;
 350 
 351 /*ARGSUSED*/
 352 static void
 353 swap_mem_config_post_add(
 354         void *arg,
 355         pgcnt_t delta_swaps)
 356 {
 357         (void) swapfs_recalc(physmem - swapfs_pending_delete);
 358 }
 359 
 360 /*ARGSUSED*/
 361 static int
 362 swap_mem_config_pre_del(
 363         void *arg,
 364         pgcnt_t delta_swaps)
 365 {
 366         pgcnt_t nv;
 367 
 368         nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
 369         if (!swapfs_recalc(physmem - nv)) {
 370                 /*
 371                  * Tidy-up is done by the call to post_del which
 372                  * is always made.
 373                  */
 374                 cmn_err(CE_NOTE, "Memory operation refused to ensure system "
 375                     "doesn't deadlock due to excessive consumption by swapfs.");
 376                 return (EBUSY);
 377         }
 378         return (0);
 379 }
 380 
 381 /*ARGSUSED*/
 382 static void
 383 swap_mem_config_post_del(
 384         void *arg,
 385         pgcnt_t delta_swaps,
 386         int cancelled)
 387 {
 388         pgcnt_t nv;
 389 
 390         nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
 391         (void) swapfs_recalc(physmem - nv);
 392 }
 393 
/*
 * Callback vector handed to kphysm_setup_func_register() by
 * swap_init_mem_config().  The initializer is positional: the interface
 * version followed by the post-add, pre-delete and post-delete handlers.
 */
static kphysm_setup_vector_t swap_mem_config_vec = {
        KPHYSM_SETUP_VECTOR_VERSION,
        swap_mem_config_post_add,
        swap_mem_config_pre_del,
        swap_mem_config_post_del,
};
 400 
 401 static void
 402 swap_init_mem_config(void)
 403 {
 404         int ret;
 405 
 406         ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
 407         ASSERT(ret == 0);
 408 }