12701 segspt_minfree needs right-sizing
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
--- old/usr/src/uts/common/vm/seg_spt.c
+++ new/usr/src/uts/common/vm/seg_spt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright 2018 Joyent, Inc.
23 + * Copyright 2019 Joyent, Inc.
24 24 * Copyright (c) 2016 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/param.h>
28 28 #include <sys/user.h>
29 29 #include <sys/mman.h>
30 30 #include <sys/kmem.h>
31 31 #include <sys/sysmacros.h>
32 32 #include <sys/cmn_err.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/tuneable.h>
35 35 #include <vm/hat.h>
36 36 #include <vm/seg.h>
37 37 #include <vm/as.h>
38 38 #include <vm/anon.h>
39 39 #include <vm/page.h>
40 40 #include <sys/buf.h>
41 41 #include <sys/swap.h>
42 42 #include <sys/atomic.h>
43 43 #include <vm/seg_spt.h>
44 44 #include <sys/debug.h>
45 45 #include <sys/vtrace.h>
46 46 #include <sys/shm.h>
47 47 #include <sys/shm_impl.h>
48 48 #include <sys/lgrp.h>
49 49 #include <sys/vmsystm.h>
50 50 #include <sys/policy.h>
51 51 #include <sys/project.h>
52 52 #include <sys/tnf_probe.h>
53 53 #include <sys/zone.h>
54 54
55 55 #define SEGSPTADDR (caddr_t)0x0
56 56
57 57 /*
58 58 * # pages used for spt
59 59 */
60 60 size_t spt_used;
61 61
62 62 /*
63 - * segspt_minfree is the memory left for system after ISM
64 - * locked its pages; it is set up to 5% of availrmem in
65 - * sptcreate when ISM is created. ISM should not use more
66 - * than ~90% of availrmem; if it does, then the performance
67 - * of the system may decrease. Machines with large memories may
68 - * be able to use up more memory for ISM so we set the default
69 - * segspt_minfree to 5% (which gives ISM max 95% of availrmem.
70 - * If somebody wants even more memory for ISM (risking hanging
71 - * the system) they can patch the segspt_minfree to smaller number.
63 + * See spt_setminfree().
72 64 */
73 65 pgcnt_t segspt_minfree = 0;
66 +size_t segspt_minfree_clamp = (1UL << 30); /* 1GB in bytes */
74 67
75 68 static int segspt_create(struct seg **segpp, void *argsp);
76 69 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
77 70 static void segspt_free(struct seg *seg);
78 71 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
79 72 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
80 73
81 74 /* ARGSUSED */
82 75 __NORETURN static int
83 76 segspt_badop_dup(struct seg *seg __unused, struct seg *newseg __unused)
84 77 {
85 78 panic("%s called", __func__);
86 79 }
87 80
88 81 /* ARGSUSED */
89 82 __NORETURN static faultcode_t
90 83 segspt_badop_fault(struct hat *hat, struct seg *seg, caddr_t addr,
91 84 size_t len, enum fault_type type, enum seg_rw rw)
92 85 {
93 86 panic("%s called", __func__);
94 87 }
95 88
96 89 /* ARGSUSED */
97 90 __NORETURN static faultcode_t
98 91 segspt_badop_faulta(struct seg *seg __unused, caddr_t addr __unused)
99 92 {
100 93 panic("%s called", __func__);
101 94 }
102 95
103 96 /* ARGSUSED */
104 97 __NORETURN static int
105 98 segspt_badop_prot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
106 99 {
107 100 panic("%s called", __func__);
108 101 }
109 102
110 103 /* ARGSUSED */
111 104 __NORETURN static int
112 105 segspt_badop_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
113 106 {
114 107 panic("%s called", __func__);
115 108 }
116 109
117 110 /* ARGSUSED */
118 111 __NORETURN static int
119 112 segspt_badop_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
120 113 {
121 114 panic("%s called", __func__);
122 115 }
123 116
124 117 /* ARGSUSED */
125 118 __NORETURN static size_t
126 119 segspt_badop_swapout(struct seg *seg)
127 120 {
128 121 panic("%s called", __func__);
129 122 }
130 123
131 124 /* ARGSUSED */
132 125 __NORETURN static int
133 126 segspt_badop_sync(struct seg *seg, caddr_t addr, size_t len, int attr,
134 127 uint_t flags)
135 128 {
136 129 panic("%s called", __func__);
137 130 }
138 131
139 132 /* ARGSUSED */
140 133 __NORETURN
141 134 static size_t
142 135 segspt_badop_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
143 136 {
144 137 panic("%s called", __func__);
145 138 }
146 139
147 140 /* ARGSUSED */
148 141 __NORETURN static int
149 142 segspt_badop_lockop(struct seg *seg, caddr_t addr, size_t len, int attr,
150 143 int op, ulong_t *lockmap, size_t pos)
151 144 {
152 145 panic("%s called", __func__);
153 146 }
154 147
155 148 /* ARGSUSED */
156 149 __NORETURN static int
157 150 segspt_badop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
158 151 {
159 152 panic("%s called", __func__);
160 153 }
161 154
162 155 /* ARGSUSED */
163 156 __NORETURN static u_offset_t
164 157 segspt_badop_getoffset(struct seg *seg, caddr_t addr)
165 158 {
166 159 panic("%s called", __func__);
167 160 }
168 161
169 162 /* ARGSUSED */
170 163 __NORETURN static int
171 164 segspt_badop_gettype(struct seg *seg, caddr_t addr)
172 165 {
173 166 panic("%s called", __func__);
174 167 }
175 168
176 169 /* ARGSUSED */
177 170 __NORETURN static int
178 171 segspt_badop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
179 172 {
180 173 panic("%s called", __func__);
181 174 }
182 175
183 176 /* ARGSUSED */
184 177 __NORETURN static int
185 178 segspt_badop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
186 179 {
187 180 panic("%s called", __func__);
188 181 }
189 182
190 183 /* ARGSUSED */
191 184 __NORETURN static void
192 185 segspt_badop_dump(struct seg *seg)
193 186 {
194 187 panic("%s called", __func__);
195 188 }
196 189
197 190 /* ARGSUSED */
198 191 __NORETURN static int
199 192 segspt_badop_pagelock(struct seg *seg, caddr_t addr, size_t len,
200 193 struct page ***ppp, enum lock_type type, enum seg_rw rw)
201 194 {
202 195 panic("%s called", __func__);
203 196 }
204 197
205 198 /* ARGSUSED */
206 199 __NORETURN static int
207 200 segspt_badop_setpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
208 201 {
209 202 panic("%s called", __func__);
210 203 }
211 204
212 205 /* ARGSUSED */
213 206 __NORETURN static int
214 207 segspt_badop_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
215 208 {
216 209 panic("%s called", __func__);
217 210 }
218 211
219 212 /* ARGSUSED */
220 213 __NORETURN static int
221 214 segspt_badop_capable(struct seg *seg, segcapability_t capability)
222 215 {
223 216 panic("%s called", __func__);
224 217 }
225 218
226 219 struct seg_ops segspt_ops = {
227 220 segspt_badop_dup, /* dup */
228 221 segspt_unmap,
229 222 segspt_free,
230 223 segspt_badop_fault, /* fault */
231 224 segspt_badop_faulta, /* faulta */
232 225 segspt_badop_prot, /* setprot */
233 226 segspt_badop_checkprot, /* checkprot */
234 227 segspt_badop_kluster, /* kluster */
235 228 segspt_badop_swapout, /* swapout */
236 229 segspt_badop_sync, /* sync */
237 230 segspt_badop_incore, /* incore */
238 231 segspt_badop_lockop, /* lockop */
239 232 segspt_badop_getprot, /* getprot */
240 233 segspt_badop_getoffset, /* getoffset */
241 234 segspt_badop_gettype, /* gettype */
242 235 segspt_badop_getvp, /* getvp */
243 236 segspt_badop_advise, /* advise */
244 237 segspt_badop_dump, /* dump */
245 238 segspt_badop_pagelock, /* pagelock */
246 239 segspt_badop_setpgsz, /* setpgsz */
247 240 segspt_badop_getmemid, /* getmemid */
248 241 segspt_getpolicy, /* getpolicy */
249 242 segspt_badop_capable, /* capable */
250 243 seg_inherit_notsup /* inherit */
251 244 };
252 245
253 246 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
254 247 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
255 248 static void segspt_shmfree(struct seg *seg);
256 249 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
257 250 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
258 251 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
259 252 static int segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len,
260 253 uint_t prot);
261 254 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
262 255 uint_t prot);
263 256 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
264 257 static size_t segspt_shmswapout(struct seg *seg);
265 258 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
266 259 char *vec);
267 260 static int segspt_shmsync(struct seg *seg, caddr_t addr, size_t len,
268 261 int attr, uint_t flags);
269 262 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
270 263 int attr, int op, ulong_t *lockmap, size_t pos);
271 264 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
272 265 uint_t *protv);
273 266 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
274 267 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
275 268 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
276 269 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
277 270 uint_t behav);
278 271 static void segspt_shmdump(struct seg *seg);
279 272 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
280 273 struct page ***, enum lock_type, enum seg_rw);
281 274 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
282 275 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
283 276 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
284 277 static int segspt_shmcapable(struct seg *, segcapability_t);
285 278
286 279 struct seg_ops segspt_shmops = {
287 280 segspt_shmdup,
288 281 segspt_shmunmap,
289 282 segspt_shmfree,
290 283 segspt_shmfault,
291 284 segspt_shmfaulta,
292 285 segspt_shmsetprot,
293 286 segspt_shmcheckprot,
294 287 segspt_shmkluster,
295 288 segspt_shmswapout,
296 289 segspt_shmsync,
297 290 segspt_shmincore,
298 291 segspt_shmlockop,
299 292 segspt_shmgetprot,
300 293 segspt_shmgetoffset,
301 294 segspt_shmgettype,
302 295 segspt_shmgetvp,
303 296 segspt_shmadvise, /* advise */
304 297 segspt_shmdump,
305 298 segspt_shmpagelock,
306 299 segspt_shmsetpgsz,
307 300 segspt_shmgetmemid,
308 301 segspt_shmgetpolicy,
309 302 segspt_shmcapable,
310 303 seg_inherit_notsup
311 304 };
312 305
313 306 static void segspt_purge(struct seg *seg);
314 307 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
315 308 enum seg_rw, int);
316 309 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
317 310 page_t **ppa);
318 311
312 +/*
313 + * This value corresponds to headroom in availrmem that ISM can never allocate
314 + * (but others can). The original intent here was to prevent ISM from locking
315 + * all of the remaining availrmem into memory, making forward progress
316 + * difficult. It's not clear how much this matters on modern systems.
317 + *
318 + * The traditional default value of 5% of total memory is used, except on
319 + * systems where that quickly gets ridiculous: in that case we clamp at a rather
320 + * arbitrary value of 1GB.
321 + *
322 + * Note that since this is called lazily on the first sptcreate(), in theory,
323 + * this could represent a very small value if the system is heavily loaded
324 + * already. In practice, the first ISM user is pretty likely to come along
325 + * earlier during the system's operation.
326 + *
327 + * This never gets re-figured.
328 + */
329 +static void
330 +spt_setminfree(void)
331 +{
332 + segspt_minfree = availrmem / 20;
319 333
334 + if (segspt_minfree_clamp != 0 &&
335 + segspt_minfree > (segspt_minfree_clamp / PAGESIZE))
336 + segspt_minfree = segspt_minfree_clamp / PAGESIZE;
337 +}
320 338
321 -/*ARGSUSED*/
322 339 int
323 340 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
324 341 uint_t prot, uint_t flags, uint_t share_szc)
325 342 {
326 343 int err;
327 344 struct as *newas;
328 345 struct segspt_crargs sptcargs;
329 346
330 347 #ifdef DEBUG
331 348 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
332 349 tnf_ulong, size, size );
333 350 #endif
334 - if (segspt_minfree == 0) /* leave min 5% of availrmem for */
335 - segspt_minfree = availrmem/20; /* for the system */
351 + if (segspt_minfree == 0)
352 + spt_setminfree();
336 353
337 354 if (!hat_supported(HAT_SHARED_PT, (void *)0))
338 355 return (EINVAL);
339 356
340 357 /*
341 358 * get a new as for this shared memory segment
342 359 */
343 360 newas = as_alloc();
344 361 newas->a_proc = NULL;
345 362 sptcargs.amp = amp;
346 363 sptcargs.prot = prot;
347 364 sptcargs.flags = flags;
348 365 sptcargs.szc = share_szc;
349 366 /*
350 367 * create a shared page table (spt) segment
351 368 */
352 369
353 370 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
354 371 as_free(newas);
355 372 return (err);
356 373 }
357 374 *sptseg = sptcargs.seg_spt;
358 375 return (0);
359 376 }
360 377
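For context, this kernel path is normally reached from userland via shmat(2): attaching a System V segment with SHM_SHARE_MMU requests ISM (SHM_PAGEABLE would request DISM instead), which is what ultimately leads to sptcreate(). The sketch below is illustrative only and not part of the webrev; the 64MB size is an arbitrary choice.

	#include <sys/ipc.h>
	#include <sys/shm.h>
	#include <stdio.h>

	int
	main(void)
	{
		size_t sz = 64UL * 1024 * 1024;	/* arbitrary 64MB segment */
		int id = shmget(IPC_PRIVATE, sz, IPC_CREAT | 0600);
		void *p;

		if (id == -1) {
			perror("shmget");
			return (1);
		}
		/* SHM_SHARE_MMU requests ISM; the segment's pages end up locked. */
		p = shmat(id, NULL, SHM_SHARE_MMU);
		if (p == (void *)-1) {
			perror("shmat");
			return (1);
		}
		printf("ISM segment attached at %p\n", p);
		(void) shmdt(p);
		(void) shmctl(id, IPC_RMID, NULL);
		return (0);
	}
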
361 378 void
362 379 sptdestroy(struct as *as, struct anon_map *amp)
363 380 {
364 381
365 382 #ifdef DEBUG
366 383 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
367 384 #endif
368 385 (void) as_unmap(as, SEGSPTADDR, amp->size);
369 386 as_free(as);
370 387 }
371 388
372 389 /*
373 390 * called from seg_free().
374 391 * free (i.e., unlock, unmap, return to free list)
375 392 * all the pages in the given seg.
376 393 */
377 394 void
378 395 segspt_free(struct seg *seg)
379 396 {
380 397 struct spt_data *sptd = (struct spt_data *)seg->s_data;
381 398
382 399 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
383 400
384 401 if (sptd != NULL) {
385 402 if (sptd->spt_realsize)
386 403 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
387 404
388 405 if (sptd->spt_ppa_lckcnt) {
389 406 kmem_free(sptd->spt_ppa_lckcnt,
390 407 sizeof (*sptd->spt_ppa_lckcnt)
391 408 * btopr(sptd->spt_amp->size));
392 409 }
393 410 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
394 411 cv_destroy(&sptd->spt_cv);
395 412 mutex_destroy(&sptd->spt_lock);
396 413 kmem_free(sptd, sizeof (*sptd));
397 414 }
398 415 }
399 416
400 417 /*ARGSUSED*/
401 418 static int
402 419 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
403 420 uint_t flags)
404 421 {
405 422 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
406 423
407 424 return (0);
408 425 }
409 426
410 427 /*ARGSUSED*/
411 428 static size_t
412 429 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
413 430 {
414 431 caddr_t eo_seg;
415 432 pgcnt_t npages;
416 433 struct shm_data *shmd = (struct shm_data *)seg->s_data;
417 434 struct seg *sptseg;
418 435 struct spt_data *sptd;
419 436
420 437 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
421 438 #ifdef lint
422 439 seg = seg;
423 440 #endif
424 441 sptseg = shmd->shm_sptseg;
425 442 sptd = sptseg->s_data;
426 443
427 444 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
428 445 eo_seg = addr + len;
429 446 while (addr < eo_seg) {
430 447 /* page exists, and it's locked. */
431 448 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
432 449 SEG_PAGE_ANON;
433 450 addr += PAGESIZE;
434 451 }
435 452 return (len);
436 453 } else {
437 454 struct anon_map *amp = shmd->shm_amp;
438 455 struct anon *ap;
439 456 page_t *pp;
440 457 pgcnt_t anon_index;
441 458 struct vnode *vp;
442 459 u_offset_t off;
443 460 ulong_t i;
444 461 int ret;
445 462 anon_sync_obj_t cookie;
446 463
447 464 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
448 465 anon_index = seg_page(seg, addr);
449 466 npages = btopr(len);
450 467 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
451 468 return (EINVAL);
452 469 }
453 470 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
454 471 for (i = 0; i < npages; i++, anon_index++) {
455 472 ret = 0;
456 473 anon_array_enter(amp, anon_index, &cookie);
457 474 ap = anon_get_ptr(amp->ahp, anon_index);
458 475 if (ap != NULL) {
459 476 swap_xlate(ap, &vp, &off);
460 477 anon_array_exit(&cookie);
461 478 pp = page_lookup_nowait(vp, off, SE_SHARED);
462 479 if (pp != NULL) {
463 480 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
464 481 page_unlock(pp);
465 482 }
466 483 } else {
467 484 anon_array_exit(&cookie);
468 485 }
469 486 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
470 487 ret |= SEG_PAGE_LOCKED;
471 488 }
472 489 *vec++ = (char)ret;
473 490 }
474 491 ANON_LOCK_EXIT(&->a_rwlock);
475 492 return (len);
476 493 }
477 494 }
478 495
479 496 static int
480 497 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
481 498 {
482 499 size_t share_size;
483 500
484 501 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
485 502
486 503 /*
487 504 * seg.s_size may have been rounded up to the largest page size
488 505 * in shmat().
489 506 * XXX This should be cleaned up. sptdestroy should take a length
490 507 * argument which should be the same as sptcreate. Then
491 508 * this rounding would not be needed (or is done in shm.c)
492 509 * Only the check for full segment will be needed.
493 510 *
494 511 * XXX -- shouldn't raddr == 0 always? These tests don't seem
495 512 * to be useful at all.
496 513 */
497 514 share_size = page_get_pagesize(seg->s_szc);
498 515 ssize = P2ROUNDUP(ssize, share_size);
499 516
500 517 if (raddr == seg->s_base && ssize == seg->s_size) {
501 518 seg_free(seg);
502 519 return (0);
503 520 } else
504 521 return (EINVAL);
505 522 }
506 523
507 524 int
508 525 segspt_create(struct seg **segpp, void *argsp)
509 526 {
510 527 struct seg *seg = *segpp;
511 528 int err;
512 529 caddr_t addr = seg->s_base;
513 530 struct spt_data *sptd;
514 531 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
515 532 struct anon_map *amp = sptcargs->amp;
516 533 struct kshmid *sp = amp->a_sp;
517 534 struct cred *cred = CRED();
518 535 ulong_t i, j, anon_index = 0;
519 536 pgcnt_t npages = btopr(amp->size);
520 537 struct vnode *vp;
521 538 page_t **ppa;
522 539 uint_t hat_flags;
523 540 size_t pgsz;
524 541 pgcnt_t pgcnt;
525 542 caddr_t a;
526 543 pgcnt_t pidx;
527 544 size_t sz;
528 545 proc_t *procp = curproc;
529 546 rctl_qty_t lockedbytes = 0;
530 547 kproject_t *proj;
531 548
532 549 /*
533 550 * We are holding the a_lock on the underlying dummy as,
534 551 * so we can make calls to the HAT layer.
535 552 */
536 553 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
537 554 ASSERT(sp != NULL);
538 555
539 556 #ifdef DEBUG
540 557 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
541 558 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
542 559 #endif
543 560 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
544 561 if (err = anon_swap_adjust(npages))
545 562 return (err);
546 563 }
547 564 err = ENOMEM;
548 565
549 566 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
550 567 goto out1;
551 568
552 569 ppa = NULL;
553 570 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
554 571 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
555 572 KM_NOSLEEP)) == NULL)
556 573 goto out2;
557 574 }
558 575
559 576 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
560 577
561 578 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
562 579 goto out3;
563 580
564 581 seg->s_ops = &segspt_ops;
565 582 sptd->spt_vp = vp;
566 583 sptd->spt_amp = amp;
567 584 sptd->spt_prot = sptcargs->prot;
568 585 sptd->spt_flags = sptcargs->flags;
569 586 seg->s_data = (caddr_t)sptd;
570 587 sptd->spt_ppa = NULL;
571 588 sptd->spt_ppa_lckcnt = NULL;
572 589 seg->s_szc = sptcargs->szc;
573 590 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
574 591 sptd->spt_gen = 0;
575 592
576 593 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
577 594 if (seg->s_szc > amp->a_szc) {
578 595 amp->a_szc = seg->s_szc;
579 596 }
580 597 ANON_LOCK_EXIT(&->a_rwlock);
581 598
582 599 /*
583 600 * Set policy to affect initial allocation of pages in
584 601 * anon_map_createpages()
585 602 */
586 603 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
587 604 NULL, 0, ptob(npages));
588 605
589 606 if (sptcargs->flags & SHM_PAGEABLE) {
590 607 size_t share_sz;
591 608 pgcnt_t new_npgs, more_pgs;
592 609 struct anon_hdr *nahp;
593 610 zone_t *zone;
594 611
595 612 share_sz = page_get_pagesize(seg->s_szc);
596 613 if (!IS_P2ALIGNED(amp->size, share_sz)) {
597 614 /*
598 615 * We are rounding up the size of the anon array
599 616 * on 4 M boundary because we always create 4 M
600 617 * of page(s) when locking, faulting pages and we
601 618 * don't have to check for all corner cases e.g.
602 619 * if there is enough space to allocate 4 M
603 620 * page.
604 621 */
605 622 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
606 623 more_pgs = new_npgs - npages;
607 624
608 625 /*
609 626 * The zone will never be NULL, as a fully created
610 627 * shm always has an owning zone.
611 628 */
612 629 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
613 630 ASSERT(zone != NULL);
614 631 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
615 632 err = ENOMEM;
616 633 goto out4;
617 634 }
618 635
619 636 nahp = anon_create(new_npgs, ANON_SLEEP);
620 637 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
621 638 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
622 639 ANON_SLEEP);
623 640 anon_release(amp->ahp, npages);
624 641 amp->ahp = nahp;
625 642 ASSERT(amp->swresv == ptob(npages));
626 643 amp->swresv = amp->size = ptob(new_npgs);
627 644 ANON_LOCK_EXIT(&->a_rwlock);
628 645 npages = new_npgs;
629 646 }
630 647
631 648 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
632 649 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
633 650 sptd->spt_pcachecnt = 0;
634 651 sptd->spt_realsize = ptob(npages);
635 652 sptcargs->seg_spt = seg;
636 653 return (0);
637 654 }
638 655
639 656 /*
640 657 * get array of pages for each anon slot in amp
641 658 */
642 659 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
643 660 seg, addr, S_CREATE, cred)) != 0)
644 661 goto out4;
645 662
646 663 mutex_enter(&sp->shm_mlock);
647 664
648 665 /* May be partially locked, so, count bytes to charge for locking */
649 666 for (i = 0; i < npages; i++)
650 667 if (ppa[i]->p_lckcnt == 0)
651 668 lockedbytes += PAGESIZE;
652 669
653 670 proj = sp->shm_perm.ipc_proj;
654 671
655 672 if (lockedbytes > 0) {
656 673 mutex_enter(&procp->p_lock);
657 674 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
658 675 mutex_exit(&procp->p_lock);
659 676 mutex_exit(&sp->shm_mlock);
660 677 for (i = 0; i < npages; i++)
661 678 page_unlock(ppa[i]);
662 679 err = ENOMEM;
663 680 goto out4;
664 681 }
665 682 mutex_exit(&procp->p_lock);
666 683 }
667 684
668 685 /*
669 686 * addr is initial address corresponding to the first page on ppa list
670 687 */
671 688 for (i = 0; i < npages; i++) {
672 689 /* attempt to lock all pages */
673 690 if (page_pp_lock(ppa[i], 0, 1) == 0) {
674 691 /*
675 692 * if unable to lock any page, unlock all
676 693 * of them and return error
677 694 */
678 695 for (j = 0; j < i; j++)
679 696 page_pp_unlock(ppa[j], 0, 1);
680 697 for (i = 0; i < npages; i++)
681 698 page_unlock(ppa[i]);
682 699 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
683 700 mutex_exit(&sp->shm_mlock);
684 701 err = ENOMEM;
685 702 goto out4;
686 703 }
687 704 }
688 705 mutex_exit(&sp->shm_mlock);
689 706
690 707 /*
691 708 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
692 709 * for the entire life of the segment. For example platforms
693 710 * that do not support Dynamic Reconfiguration.
694 711 */
695 712 hat_flags = HAT_LOAD_SHARE;
696 713 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
697 714 hat_flags |= HAT_LOAD_LOCK;
698 715
699 716 /*
700 717 * Load translations one large page at a time
701 718 * to make sure we don't create mappings bigger than
702 719 * segment's size code in case underlying pages
703 720 * are shared with segvn's segment that uses bigger
704 721 * size code than we do.
705 722 */
706 723 pgsz = page_get_pagesize(seg->s_szc);
707 724 pgcnt = page_get_pagecnt(seg->s_szc);
708 725 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
709 726 sz = MIN(pgsz, ptob(npages - pidx));
710 727 hat_memload_array(seg->s_as->a_hat, a, sz,
711 728 &ppa[pidx], sptd->spt_prot, hat_flags);
712 729 }
713 730
714 731 /*
715 732 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
716 733 * we will leave the pages locked SE_SHARED for the life
717 734 * of the ISM segment. This will prevent any calls to
718 735 * hat_pageunload() on this ISM segment for those platforms.
719 736 */
720 737 if (!(hat_flags & HAT_LOAD_LOCK)) {
721 738 /*
722 739 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
723 740 * we no longer need to hold the SE_SHARED lock on the pages,
724 741 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
725 742 * SE_SHARED lock on the pages as necessary.
726 743 */
727 744 for (i = 0; i < npages; i++)
728 745 page_unlock(ppa[i]);
729 746 }
730 747 sptd->spt_pcachecnt = 0;
731 748 kmem_free(ppa, ((sizeof (page_t *)) * npages));
732 749 sptd->spt_realsize = ptob(npages);
733 750 atomic_add_long(&spt_used, npages);
734 751 sptcargs->seg_spt = seg;
735 752 return (0);
736 753
737 754 out4:
738 755 seg->s_data = NULL;
739 756 kmem_free(vp, sizeof (*vp));
740 757 cv_destroy(&sptd->spt_cv);
741 758 out3:
742 759 mutex_destroy(&sptd->spt_lock);
743 760 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
744 761 kmem_free(ppa, (sizeof (*ppa) * npages));
745 762 out2:
746 763 kmem_free(sptd, sizeof (*sptd));
747 764 out1:
748 765 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
749 766 anon_swap_restore(npages);
750 767 return (err);
751 768 }
752 769
753 770 /*ARGSUSED*/
754 771 void
755 772 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
756 773 {
757 774 struct page *pp;
758 775 struct spt_data *sptd = (struct spt_data *)seg->s_data;
759 776 pgcnt_t npages;
760 777 ulong_t anon_idx;
761 778 struct anon_map *amp;
762 779 struct anon *ap;
763 780 struct vnode *vp;
764 781 u_offset_t off;
765 782 uint_t hat_flags;
766 783 int root = 0;
767 784 pgcnt_t pgs, curnpgs = 0;
768 785 page_t *rootpp;
769 786 rctl_qty_t unlocked_bytes = 0;
770 787 kproject_t *proj;
771 788 kshmid_t *sp;
772 789
773 790 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
774 791
775 792 len = P2ROUNDUP(len, PAGESIZE);
776 793
777 794 npages = btop(len);
778 795
779 796 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
780 797 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
781 798 (sptd->spt_flags & SHM_PAGEABLE)) {
782 799 hat_flags = HAT_UNLOAD_UNMAP;
783 800 }
784 801
785 802 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
786 803
787 804 amp = sptd->spt_amp;
788 805 if (sptd->spt_flags & SHM_PAGEABLE)
789 806 npages = btop(amp->size);
790 807
791 808 ASSERT(amp != NULL);
792 809
793 810 proj = NULL;
794 811 rootpp = NULL;
795 812 sp = NULL;
796 813 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
797 814 sp = amp->a_sp;
798 815 proj = sp->shm_perm.ipc_proj;
799 816 mutex_enter(&sp->shm_mlock);
800 817 }
801 818 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
802 819 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
803 820 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
804 821 panic("segspt_free_pages: null app");
805 822 /*NOTREACHED*/
806 823 }
807 824 } else {
808 825 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
809 826 == NULL)
810 827 continue;
811 828 }
812 829 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
813 830 swap_xlate(ap, &vp, &off);
814 831
815 832 /*
816 833 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
817 834 * the pages won't be having SE_SHARED lock at this
818 835 * point.
819 836 *
820 837 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
821 838 * the pages are still held SE_SHARED locked from the
822 839 * original segspt_create()
823 840 *
824 841 * Our goal is to get SE_EXCL lock on each page, remove
825 842 * permanent lock on it and invalidate the page.
826 843 */
827 844 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
828 845 if (hat_flags == HAT_UNLOAD_UNMAP)
829 846 pp = page_lookup(vp, off, SE_EXCL);
830 847 else {
831 848 if ((pp = page_find(vp, off)) == NULL) {
832 849 panic("segspt_free_pages: "
833 850 "page not locked");
834 851 /*NOTREACHED*/
835 852 }
836 853 if (!page_tryupgrade(pp)) {
837 854 page_unlock(pp);
838 855 pp = page_lookup(vp, off, SE_EXCL);
839 856 }
840 857 }
841 858 if (pp == NULL) {
842 859 panic("segspt_free_pages: "
843 860 "page not in the system");
844 861 /*NOTREACHED*/
845 862 }
846 863 ASSERT(pp->p_lckcnt > 0);
847 864 page_pp_unlock(pp, 0, 1);
848 865 if (pp->p_lckcnt == 0)
849 866 unlocked_bytes += PAGESIZE;
850 867 } else {
851 868 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
852 869 continue;
853 870 }
854 871 /*
855 872 * It's logical to invalidate the pages here as in most cases
856 873 * these were created by segspt.
857 874 */
858 875 if (pp->p_szc != 0) {
859 876 if (root == 0) {
860 877 ASSERT(curnpgs == 0);
861 878 root = 1;
862 879 rootpp = pp;
863 880 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
864 881 ASSERT(pgs > 1);
865 882 ASSERT(IS_P2ALIGNED(pgs, pgs));
866 883 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
867 884 curnpgs--;
868 885 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
869 886 ASSERT(curnpgs == 1);
870 887 ASSERT(page_pptonum(pp) ==
871 888 page_pptonum(rootpp) + (pgs - 1));
872 889 page_destroy_pages(rootpp);
873 890 root = 0;
874 891 curnpgs = 0;
875 892 } else {
876 893 ASSERT(curnpgs > 1);
877 894 ASSERT(page_pptonum(pp) ==
878 895 page_pptonum(rootpp) + (pgs - curnpgs));
879 896 curnpgs--;
880 897 }
881 898 } else {
882 899 if (root != 0 || curnpgs != 0) {
883 900 panic("segspt_free_pages: bad large page");
884 901 /*NOTREACHED*/
885 902 }
886 903 /*
887 904 * Before destroying the pages, we need to take care
888 905 * of the rctl locked memory accounting. For that
889 906 * we need to calculate the unlocked_bytes.
890 907 */
891 908 if (pp->p_lckcnt > 0)
892 909 unlocked_bytes += PAGESIZE;
893 910 /*LINTED: constant in conditional context */
894 911 VN_DISPOSE(pp, B_INVAL, 0, kcred);
895 912 }
896 913 }
897 914 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
898 915 if (unlocked_bytes > 0)
899 916 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
900 917 mutex_exit(&sp->shm_mlock);
901 918 }
902 919 if (root != 0 || curnpgs != 0) {
903 920 panic("segspt_free_pages: bad large page");
904 921 /*NOTREACHED*/
905 922 }
906 923
907 924 /*
908 925 * mark that pages have been released
909 926 */
910 927 sptd->spt_realsize = 0;
911 928
912 929 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
913 930 atomic_add_long(&spt_used, -npages);
914 931 anon_swap_restore(npages);
915 932 }
916 933 }
917 934
918 935 /*
919 936 * Get memory allocation policy info for specified address in given segment
920 937 */
921 938 static lgrp_mem_policy_info_t *
922 939 segspt_getpolicy(struct seg *seg, caddr_t addr)
923 940 {
924 941 struct anon_map *amp;
925 942 ulong_t anon_index;
926 943 lgrp_mem_policy_info_t *policy_info;
927 944 struct spt_data *spt_data;
928 945
929 946 ASSERT(seg != NULL);
930 947
931 948 /*
932 949 * Get anon_map from segspt
933 950 *
934 951 * Assume that no lock needs to be held on anon_map, since
935 952 * it should be protected by its reference count which must be
936 953 * nonzero for an existing segment
937 954 * Need to grab readers lock on policy tree though
938 955 */
939 956 spt_data = (struct spt_data *)seg->s_data;
940 957 if (spt_data == NULL)
941 958 return (NULL);
942 959 amp = spt_data->spt_amp;
943 960 ASSERT(amp->refcnt != 0);
944 961
945 962 /*
946 963 * Get policy info
947 964 *
948 965 * Assume starting anon index of 0
949 966 */
950 967 anon_index = seg_page(seg, addr);
951 968 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
952 969
953 970 return (policy_info);
954 971 }
955 972
956 973 /*
957 974 * DISM only.
958 975 * Return locked pages over a given range.
959 976 *
960 977 * We will cache all DISM locked pages and save the pplist for the
961 978 * entire segment in the ppa field of the underlying DISM segment structure.
962 979 * Later, during a call to segspt_reclaim() we will use this ppa array
963 980 * to page_unlock() all of the pages and then we will free this ppa list.
964 981 */
965 982 /*ARGSUSED*/
966 983 static int
967 984 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
968 985 struct page ***ppp, enum lock_type type, enum seg_rw rw)
969 986 {
970 987 struct shm_data *shmd = (struct shm_data *)seg->s_data;
971 988 struct seg *sptseg = shmd->shm_sptseg;
972 989 struct spt_data *sptd = sptseg->s_data;
973 990 pgcnt_t pg_idx, npages, tot_npages, npgs;
974 991 struct page **pplist, **pl, **ppa, *pp;
975 992 struct anon_map *amp;
976 993 spgcnt_t an_idx;
977 994 int ret = ENOTSUP;
978 995 uint_t pl_built = 0;
979 996 struct anon *ap;
980 997 struct vnode *vp;
981 998 u_offset_t off;
982 999 pgcnt_t claim_availrmem = 0;
983 1000 uint_t szc;
984 1001
985 1002 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
986 1003 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
987 1004
988 1005 /*
989 1006 * We want to lock/unlock the entire ISM segment. Therefore,
990 1007 * we will be using the underlying sptseg and its base address
991 1008 * and length for the caching arguments.
992 1009 */
993 1010 ASSERT(sptseg);
994 1011 ASSERT(sptd);
995 1012
996 1013 pg_idx = seg_page(seg, addr);
997 1014 npages = btopr(len);
998 1015
999 1016 /*
1000 1017 * check if the request is larger than number of pages covered
1001 1018 * by amp
1002 1019 */
1003 1020 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
1004 1021 *ppp = NULL;
1005 1022 return (ENOTSUP);
1006 1023 }
1007 1024
1008 1025 if (type == L_PAGEUNLOCK) {
1009 1026 ASSERT(sptd->spt_ppa != NULL);
1010 1027
1011 1028 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1012 1029 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1013 1030
1014 1031 /*
1015 1032 * If someone is blocked while unmapping, we purge
1016 1033 * segment page cache and thus reclaim pplist synchronously
1017 1034 * without waiting for seg_pasync_thread. This speeds up
1018 1035 * unmapping in cases where munmap(2) is called, while
1019 1036 * raw async i/o is still in progress or where a thread
1020 1037 * exits on data fault in a multithreaded application.
1021 1038 */
1022 1039 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
1023 1040 (AS_ISUNMAPWAIT(seg->s_as) &&
1024 1041 shmd->shm_softlockcnt > 0)) {
1025 1042 segspt_purge(seg);
1026 1043 }
1027 1044 return (0);
1028 1045 }
1029 1046
1030 1047 /* The L_PAGELOCK case ... */
1031 1048
1032 1049 if (sptd->spt_flags & DISM_PPA_CHANGED) {
1033 1050 segspt_purge(seg);
1034 1051 /*
1035 1052 * for DISM ppa needs to be rebuilt since
1036 1053 * number of locked pages could be changed
1037 1054 */
1038 1055 *ppp = NULL;
1039 1056 return (ENOTSUP);
1040 1057 }
1041 1058
1042 1059 /*
1043 1060 * First try to find pages in segment page cache, without
1044 1061 * holding the segment lock.
1045 1062 */
1046 1063 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1047 1064 S_WRITE, SEGP_FORCE_WIRED);
1048 1065 if (pplist != NULL) {
1049 1066 ASSERT(sptd->spt_ppa != NULL);
1050 1067 ASSERT(sptd->spt_ppa == pplist);
1051 1068 ppa = sptd->spt_ppa;
1052 1069 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1053 1070 if (ppa[an_idx] == NULL) {
1054 1071 seg_pinactive(seg, NULL, seg->s_base,
1055 1072 sptd->spt_amp->size, ppa,
1056 1073 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1057 1074 *ppp = NULL;
1058 1075 return (ENOTSUP);
1059 1076 }
1060 1077 if ((szc = ppa[an_idx]->p_szc) != 0) {
1061 1078 npgs = page_get_pagecnt(szc);
1062 1079 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1063 1080 } else {
1064 1081 an_idx++;
1065 1082 }
1066 1083 }
1067 1084 /*
1068 1085 * Since we cache the entire DISM segment, we want to
1069 1086 * set ppp to point to the first slot that corresponds
1070 1087 * to the requested addr, i.e. pg_idx.
1071 1088 */
1072 1089 *ppp = &(sptd->spt_ppa[pg_idx]);
1073 1090 return (0);
1074 1091 }
1075 1092
1076 1093 mutex_enter(&sptd->spt_lock);
1077 1094 /*
1078 1095 * try to find pages in segment page cache with mutex
1079 1096 */
1080 1097 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1081 1098 S_WRITE, SEGP_FORCE_WIRED);
1082 1099 if (pplist != NULL) {
1083 1100 ASSERT(sptd->spt_ppa != NULL);
1084 1101 ASSERT(sptd->spt_ppa == pplist);
1085 1102 ppa = sptd->spt_ppa;
1086 1103 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1087 1104 if (ppa[an_idx] == NULL) {
1088 1105 mutex_exit(&sptd->spt_lock);
1089 1106 seg_pinactive(seg, NULL, seg->s_base,
1090 1107 sptd->spt_amp->size, ppa,
1091 1108 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1092 1109 *ppp = NULL;
1093 1110 return (ENOTSUP);
1094 1111 }
1095 1112 if ((szc = ppa[an_idx]->p_szc) != 0) {
1096 1113 npgs = page_get_pagecnt(szc);
1097 1114 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1098 1115 } else {
1099 1116 an_idx++;
1100 1117 }
1101 1118 }
1102 1119 /*
1103 1120 * Since we cache the entire DISM segment, we want to
1104 1121 * set ppp to point to the first slot that corresponds
1105 1122 * to the requested addr, i.e. pg_idx.
1106 1123 */
1107 1124 mutex_exit(&sptd->spt_lock);
1108 1125 *ppp = &(sptd->spt_ppa[pg_idx]);
1109 1126 return (0);
1110 1127 }
1111 1128 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1112 1129 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1113 1130 mutex_exit(&sptd->spt_lock);
1114 1131 *ppp = NULL;
1115 1132 return (ENOTSUP);
1116 1133 }
1117 1134
1118 1135 /*
1119 1136 * No need to worry about protections because DISM pages are always rw.
1120 1137 */
1121 1138 pl = pplist = NULL;
1122 1139 amp = sptd->spt_amp;
1123 1140
1124 1141 /*
1125 1142 * Do we need to build the ppa array?
1126 1143 */
1127 1144 if (sptd->spt_ppa == NULL) {
1128 1145 pgcnt_t lpg_cnt = 0;
1129 1146
1130 1147 pl_built = 1;
1131 1148 tot_npages = btopr(sptd->spt_amp->size);
1132 1149
1133 1150 ASSERT(sptd->spt_pcachecnt == 0);
1134 1151 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
1135 1152 pl = pplist;
1136 1153
1137 1154 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
1138 1155 for (an_idx = 0; an_idx < tot_npages; ) {
1139 1156 ap = anon_get_ptr(amp->ahp, an_idx);
1140 1157 /*
1141 1158 * Cache only mlocked pages. For large pages
1142 1159 * if one (constituent) page is mlocked
1143 1160 * all pages for that large page
1144 1161 * are cached also. This is for quick
1145 1162 * lookups of ppa array;
1146 1163 */
1147 1164 if ((ap != NULL) && (lpg_cnt != 0 ||
1148 1165 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1149 1166
1150 1167 swap_xlate(ap, &vp, &off);
1151 1168 pp = page_lookup(vp, off, SE_SHARED);
1152 1169 ASSERT(pp != NULL);
1153 1170 if (lpg_cnt == 0) {
1154 1171 lpg_cnt++;
1155 1172 /*
1156 1173 * For a small page, we are done --
1157 1174 * lpg_count is reset to 0 below.
1158 1175 *
1159 1176 * For a large page, we are guaranteed
1160 1177 * to find the anon structures of all
1161 1178 * constituent pages and a non-zero
1162 1179 * lpg_cnt ensures that we don't test
1163 1180 * for mlock for these. We are done
1164 1181 * when lpg_count reaches (npgs + 1).
1165 1182 * If we are not the first constituent
1166 1183 * page, restart at the first one.
1167 1184 */
1168 1185 npgs = page_get_pagecnt(pp->p_szc);
1169 1186 if (!IS_P2ALIGNED(an_idx, npgs)) {
1170 1187 an_idx = P2ALIGN(an_idx, npgs);
1171 1188 page_unlock(pp);
1172 1189 continue;
1173 1190 }
1174 1191 }
1175 1192 if (++lpg_cnt > npgs)
1176 1193 lpg_cnt = 0;
1177 1194
1178 1195 /*
1179 1196 * availrmem is decremented only
1180 1197 * for unlocked pages
1181 1198 */
1182 1199 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1183 1200 claim_availrmem++;
1184 1201 pplist[an_idx] = pp;
1185 1202 }
1186 1203 an_idx++;
1187 1204 }
1188 1205 ANON_LOCK_EXIT(&->a_rwlock);
1189 1206
1190 1207 if (claim_availrmem) {
1191 1208 mutex_enter(&freemem_lock);
1192 1209 if (availrmem < tune.t_minarmem + claim_availrmem) {
1193 1210 mutex_exit(&freemem_lock);
1194 1211 ret = ENOTSUP;
1195 1212 claim_availrmem = 0;
1196 1213 goto insert_fail;
1197 1214 } else {
1198 1215 availrmem -= claim_availrmem;
1199 1216 }
1200 1217 mutex_exit(&freemem_lock);
1201 1218 }
1202 1219
1203 1220 sptd->spt_ppa = pl;
1204 1221 } else {
1205 1222 /*
1206 1223 * We already have a valid ppa[].
1207 1224 */
1208 1225 pl = sptd->spt_ppa;
1209 1226 }
1210 1227
1211 1228 ASSERT(pl != NULL);
1212 1229
1213 1230 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1214 1231 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1215 1232 segspt_reclaim);
1216 1233 if (ret == SEGP_FAIL) {
1217 1234 /*
1218 1235 * seg_pinsert failed. We return
1219 1236 * ENOTSUP, so that the as_pagelock() code will
1220 1237 * then try the slower F_SOFTLOCK path.
1221 1238 */
1222 1239 if (pl_built) {
1223 1240 /*
1224 1241 * No one else has referenced the ppa[].
1225 1242 * We created it and we need to destroy it.
1226 1243 */
1227 1244 sptd->spt_ppa = NULL;
1228 1245 }
1229 1246 ret = ENOTSUP;
1230 1247 goto insert_fail;
1231 1248 }
1232 1249
1233 1250 /*
1234 1251 * In either case, we increment softlockcnt on the 'real' segment.
1235 1252 */
1236 1253 sptd->spt_pcachecnt++;
1237 1254 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1238 1255
1239 1256 ppa = sptd->spt_ppa;
1240 1257 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1241 1258 if (ppa[an_idx] == NULL) {
1242 1259 mutex_exit(&sptd->spt_lock);
1243 1260 seg_pinactive(seg, NULL, seg->s_base,
1244 1261 sptd->spt_amp->size,
1245 1262 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1246 1263 *ppp = NULL;
1247 1264 return (ENOTSUP);
1248 1265 }
1249 1266 if ((szc = ppa[an_idx]->p_szc) != 0) {
1250 1267 npgs = page_get_pagecnt(szc);
1251 1268 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1252 1269 } else {
1253 1270 an_idx++;
1254 1271 }
1255 1272 }
1256 1273 /*
1257 1274 * We can now drop the sptd->spt_lock since the ppa[]
1258 1275 * exists and we have incremented pcachecnt.
1259 1276 */
1260 1277 mutex_exit(&sptd->spt_lock);
1261 1278
1262 1279 /*
1263 1280 * Since we cache the entire segment, we want to
1264 1281 * set ppp to point to the first slot that corresponds
1265 1282 * to the requested addr, i.e. pg_idx.
1266 1283 */
1267 1284 *ppp = &(sptd->spt_ppa[pg_idx]);
1268 1285 return (0);
1269 1286
1270 1287 insert_fail:
1271 1288 /*
1272 1289 * We will only reach this code if we tried and failed.
1273 1290 *
1274 1291 * And we can drop the lock on the dummy seg, once we've failed
1275 1292 * to set up a new ppa[].
1276 1293 */
1277 1294 mutex_exit(&sptd->spt_lock);
1278 1295
1279 1296 if (pl_built) {
1280 1297 if (claim_availrmem) {
1281 1298 mutex_enter(&freemem_lock);
1282 1299 availrmem += claim_availrmem;
1283 1300 mutex_exit(&freemem_lock);
1284 1301 }
1285 1302
1286 1303 /*
1287 1304 * We created pl and we need to destroy it.
1288 1305 */
1289 1306 pplist = pl;
1290 1307 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1291 1308 if (pplist[an_idx] != NULL)
1292 1309 page_unlock(pplist[an_idx]);
1293 1310 }
1294 1311 kmem_free(pl, sizeof (page_t *) * tot_npages);
1295 1312 }
1296 1313
1297 1314 if (shmd->shm_softlockcnt <= 0) {
1298 1315 if (AS_ISUNMAPWAIT(seg->s_as)) {
1299 1316 mutex_enter(&seg->s_as->a_contents);
1300 1317 if (AS_ISUNMAPWAIT(seg->s_as)) {
1301 1318 AS_CLRUNMAPWAIT(seg->s_as);
1302 1319 cv_broadcast(&seg->s_as->a_cv);
1303 1320 }
1304 1321 mutex_exit(&seg->s_as->a_contents);
1305 1322 }
1306 1323 }
1307 1324 *ppp = NULL;
1308 1325 return (ret);
1309 1326 }
1310 1327
1311 1328
1312 1329
1313 1330 /*
1314 1331 * return locked pages over a given range.
1315 1332 *
1316 1333 * We will cache the entire ISM segment and save the pplist for the
1317 1334 * entire segment in the ppa field of the underlying ISM segment structure.
1318 1335 * Later, during a call to segspt_reclaim() we will use this ppa array
1319 1336 * to page_unlock() all of the pages and then we will free this ppa list.
1320 1337 */
1321 1338 /*ARGSUSED*/
1322 1339 static int
1323 1340 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1324 1341 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1325 1342 {
1326 1343 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1327 1344 struct seg *sptseg = shmd->shm_sptseg;
1328 1345 struct spt_data *sptd = sptseg->s_data;
1329 1346 pgcnt_t np, page_index, npages;
1330 1347 caddr_t a, spt_base;
1331 1348 struct page **pplist, **pl, *pp;
1332 1349 struct anon_map *amp;
1333 1350 ulong_t anon_index;
1334 1351 int ret = ENOTSUP;
1335 1352 uint_t pl_built = 0;
1336 1353 struct anon *ap;
1337 1354 struct vnode *vp;
1338 1355 u_offset_t off;
1339 1356
1340 1357 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1341 1358 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1342 1359
1343 1360
1344 1361 /*
1345 1362 * We want to lock/unlock the entire ISM segment. Therefore,
1346 1363 * we will be using the underlying sptseg and its base address
1347 1364 * and length for the caching arguments.
1348 1365 */
1349 1366 ASSERT(sptseg);
1350 1367 ASSERT(sptd);
1351 1368
1352 1369 if (sptd->spt_flags & SHM_PAGEABLE) {
1353 1370 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1354 1371 }
1355 1372
1356 1373 page_index = seg_page(seg, addr);
1357 1374 npages = btopr(len);
1358 1375
1359 1376 /*
1360 1377 * check if the request is larger than number of pages covered
1361 1378 * by amp
1362 1379 */
1363 1380 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1364 1381 *ppp = NULL;
1365 1382 return (ENOTSUP);
1366 1383 }
1367 1384
1368 1385 if (type == L_PAGEUNLOCK) {
1369 1386
1370 1387 ASSERT(sptd->spt_ppa != NULL);
1371 1388
1372 1389 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1373 1390 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1374 1391
1375 1392 /*
1376 1393 * If someone is blocked while unmapping, we purge
1377 1394 * segment page cache and thus reclaim pplist synchronously
1378 1395 * without waiting for seg_pasync_thread. This speeds up
1379 1396 * unmapping in cases where munmap(2) is called, while
1380 1397 * raw async i/o is still in progress or where a thread
1381 1398 * exits on data fault in a multithreaded application.
1382 1399 */
1383 1400 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1384 1401 segspt_purge(seg);
1385 1402 }
1386 1403 return (0);
1387 1404 }
1388 1405
1389 1406 /* The L_PAGELOCK case... */
1390 1407
1391 1408 /*
1392 1409 * First try to find pages in segment page cache, without
1393 1410 * holding the segment lock.
1394 1411 */
1395 1412 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1396 1413 S_WRITE, SEGP_FORCE_WIRED);
1397 1414 if (pplist != NULL) {
1398 1415 ASSERT(sptd->spt_ppa == pplist);
1399 1416 ASSERT(sptd->spt_ppa[page_index]);
1400 1417 /*
1401 1418 * Since we cache the entire ISM segment, we want to
1402 1419 * set ppp to point to the first slot that corresponds
1403 1420 * to the requested addr, i.e. page_index.
1404 1421 */
1405 1422 *ppp = &(sptd->spt_ppa[page_index]);
1406 1423 return (0);
1407 1424 }
1408 1425
1409 1426 mutex_enter(&sptd->spt_lock);
1410 1427
1411 1428 /*
1412 1429 * try to find pages in segment page cache
1413 1430 */
1414 1431 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1415 1432 S_WRITE, SEGP_FORCE_WIRED);
1416 1433 if (pplist != NULL) {
1417 1434 ASSERT(sptd->spt_ppa == pplist);
1418 1435 /*
1419 1436 * Since we cache the entire segment, we want to
1420 1437 * set ppp to point to the first slot that corresponds
1421 1438 * to the requested addr, i.e. page_index.
1422 1439 */
1423 1440 mutex_exit(&sptd->spt_lock);
1424 1441 *ppp = &(sptd->spt_ppa[page_index]);
1425 1442 return (0);
1426 1443 }
1427 1444
1428 1445 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1429 1446 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1430 1447 mutex_exit(&sptd->spt_lock);
1431 1448 *ppp = NULL;
1432 1449 return (ENOTSUP);
1433 1450 }
1434 1451
1435 1452 /*
1436 1453 * No need to worry about protections because ISM pages
1437 1454 * are always rw.
1438 1455 */
1439 1456 pl = pplist = NULL;
1440 1457
1441 1458 /*
1442 1459 * Do we need to build the ppa array?
1443 1460 */
1444 1461 if (sptd->spt_ppa == NULL) {
1445 1462 ASSERT(sptd->spt_ppa == pplist);
1446 1463
1447 1464 spt_base = sptseg->s_base;
1448 1465 pl_built = 1;
1449 1466
1450 1467 /*
1451 1468 * availrmem is decremented once during anon_swap_adjust()
1452 1469 * and is incremented during the anon_unresv(), which is
1453 1470 * called from shm_rm_amp() when the segment is destroyed.
1454 1471 */
1455 1472 amp = sptd->spt_amp;
1456 1473 ASSERT(amp != NULL);
1457 1474
1458 1475 /* pcachecnt is protected by sptd->spt_lock */
1459 1476 ASSERT(sptd->spt_pcachecnt == 0);
1460 1477 pplist = kmem_zalloc(sizeof (page_t *)
1461 1478 * btopr(sptd->spt_amp->size), KM_SLEEP);
1462 1479 pl = pplist;
1463 1480
1464 1481 anon_index = seg_page(sptseg, spt_base);
1465 1482
1466 1483 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
1467 1484 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1468 1485 a += PAGESIZE, anon_index++, pplist++) {
1469 1486 ap = anon_get_ptr(amp->ahp, anon_index);
1470 1487 ASSERT(ap != NULL);
1471 1488 swap_xlate(ap, &vp, &off);
1472 1489 pp = page_lookup(vp, off, SE_SHARED);
1473 1490 ASSERT(pp != NULL);
1474 1491 *pplist = pp;
1475 1492 }
1476 1493 ANON_LOCK_EXIT(&->a_rwlock);
1477 1494
1478 1495 if (a < (spt_base + sptd->spt_amp->size)) {
1479 1496 ret = ENOTSUP;
1480 1497 goto insert_fail;
1481 1498 }
1482 1499 sptd->spt_ppa = pl;
1483 1500 } else {
1484 1501 /*
1485 1502 * We already have a valid ppa[].
1486 1503 */
1487 1504 pl = sptd->spt_ppa;
1488 1505 }
1489 1506
1490 1507 ASSERT(pl != NULL);
1491 1508
1492 1509 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1493 1510 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1494 1511 segspt_reclaim);
1495 1512 if (ret == SEGP_FAIL) {
1496 1513 /*
1497 1514 * seg_pinsert failed. We return
1498 1515 * ENOTSUP, so that the as_pagelock() code will
1499 1516 * then try the slower F_SOFTLOCK path.
1500 1517 */
1501 1518 if (pl_built) {
1502 1519 /*
1503 1520 * No one else has referenced the ppa[].
1504 1521 * We created it and we need to destroy it.
1505 1522 */
1506 1523 sptd->spt_ppa = NULL;
1507 1524 }
1508 1525 ret = ENOTSUP;
1509 1526 goto insert_fail;
1510 1527 }
1511 1528
1512 1529 /*
1513 1530 * In either case, we increment softlockcnt on the 'real' segment.
1514 1531 */
1515 1532 sptd->spt_pcachecnt++;
1516 1533 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1517 1534
1518 1535 /*
1519 1536 * We can now drop the sptd->spt_lock since the ppa[]
1520 1537 * exists and we have incremented pcachecnt.
1521 1538 */
1522 1539 mutex_exit(&sptd->spt_lock);
1523 1540
1524 1541 /*
1525 1542 * Since we cache the entire segment, we want to
1526 1543 * set ppp to point to the first slot that corresponds
1527 1544 * to the requested addr, i.e. page_index.
1528 1545 */
1529 1546 *ppp = &(sptd->spt_ppa[page_index]);
1530 1547 return (0);
1531 1548
1532 1549 insert_fail:
1533 1550 /*
1534 1551 * We will only reach this code if we tried and failed.
1535 1552 *
1536 1553 * And we can drop the lock on the dummy seg, once we've failed
1537 1554 * to set up a new ppa[].
1538 1555 */
1539 1556 mutex_exit(&sptd->spt_lock);
1540 1557
1541 1558 if (pl_built) {
1542 1559 /*
1543 1560 * We created pl and we need to destroy it.
1544 1561 */
1545 1562 pplist = pl;
1546 1563 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1547 1564 while (np) {
1548 1565 page_unlock(*pplist);
1549 1566 np--;
1550 1567 pplist++;
1551 1568 }
1552 1569 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1553 1570 }
1554 1571 if (shmd->shm_softlockcnt <= 0) {
1555 1572 if (AS_ISUNMAPWAIT(seg->s_as)) {
1556 1573 mutex_enter(&seg->s_as->a_contents);
1557 1574 if (AS_ISUNMAPWAIT(seg->s_as)) {
1558 1575 AS_CLRUNMAPWAIT(seg->s_as);
1559 1576 cv_broadcast(&seg->s_as->a_cv);
1560 1577 }
1561 1578 mutex_exit(&seg->s_as->a_contents);
1562 1579 }
1563 1580 }
1564 1581 *ppp = NULL;
1565 1582 return (ret);
1566 1583 }
1567 1584
1568 1585 /*
1569 1586 * purge any cached pages in the I/O page cache
1570 1587 */
1571 1588 static void
1572 1589 segspt_purge(struct seg *seg)
1573 1590 {
1574 1591 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1575 1592 }
1576 1593
1577 1594 static int
1578 1595 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1579 1596 enum seg_rw rw, int async)
1580 1597 {
1581 1598 struct seg *seg = (struct seg *)ptag;
1582 1599 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1583 1600 struct seg *sptseg;
1584 1601 struct spt_data *sptd;
1585 1602 pgcnt_t npages, i, free_availrmem = 0;
1586 1603 int done = 0;
1587 1604
1588 1605 #ifdef lint
1589 1606 addr = addr;
1590 1607 #endif
1591 1608 sptseg = shmd->shm_sptseg;
1592 1609 sptd = sptseg->s_data;
1593 1610 npages = (len >> PAGESHIFT);
1594 1611 ASSERT(npages);
1595 1612 ASSERT(sptd->spt_pcachecnt != 0);
1596 1613 ASSERT(sptd->spt_ppa == pplist);
1597 1614 ASSERT(npages == btopr(sptd->spt_amp->size));
1598 1615 ASSERT(async || AS_LOCK_HELD(seg->s_as));
1599 1616
1600 1617 /*
1601 1618 * Acquire the lock on the dummy seg and destroy the
1602 1619 * ppa array IF this is the last pcachecnt.
1603 1620 */
1604 1621 mutex_enter(&sptd->spt_lock);
1605 1622 if (--sptd->spt_pcachecnt == 0) {
1606 1623 for (i = 0; i < npages; i++) {
1607 1624 if (pplist[i] == NULL) {
1608 1625 continue;
1609 1626 }
1610 1627 if (rw == S_WRITE) {
1611 1628 hat_setrefmod(pplist[i]);
1612 1629 } else {
1613 1630 hat_setref(pplist[i]);
1614 1631 }
1615 1632 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1616 1633 (sptd->spt_ppa_lckcnt[i] == 0))
1617 1634 free_availrmem++;
1618 1635 page_unlock(pplist[i]);
1619 1636 }
1620 1637 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1621 1638 mutex_enter(&freemem_lock);
1622 1639 availrmem += free_availrmem;
1623 1640 mutex_exit(&freemem_lock);
1624 1641 }
1625 1642 /*
1626 1643 * Since we want to cache/uncache the entire ISM segment,
1627 1644 * we will track the pplist in a segspt specific field
1628 1645 * ppa, that is initialized at the time we add an entry to
1629 1646 * the cache.
1630 1647 */
1631 1648 ASSERT(sptd->spt_pcachecnt == 0);
1632 1649 kmem_free(pplist, sizeof (page_t *) * npages);
1633 1650 sptd->spt_ppa = NULL;
1634 1651 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1635 1652 sptd->spt_gen++;
1636 1653 cv_broadcast(&sptd->spt_cv);
1637 1654 done = 1;
1638 1655 }
1639 1656 mutex_exit(&sptd->spt_lock);
1640 1657
1641 1658 /*
1642 1659 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1643 1660 * may not hold AS lock (in this case async argument is not 0). This
1644 1661 * means if softlockcnt drops to 0 after the decrement below address
1645 1662 * space may get freed. We can't allow it since after softlock
1646 1663 * decrement to 0 we still need to access the as structure for possible
1647 1664 * wakeup of unmap waiters. To prevent the disappearance of as we take
1648 1665 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1649 1666 * this mutex as a barrier to make sure this routine completes before
1650 1667 * segment is freed.
1651 1668 *
1652 1669 * The second complication we have to deal with in async case is a
1653 1670 * possibility of missed wake up of unmap wait thread. When we don't
1654 1671 * hold as lock here we may take a_contents lock before unmap wait
1655 1672 * thread that was first to see softlockcnt was still not 0. As a
1656 1673 * result we'll fail to wake up an unmap wait thread. To avoid this
1657 1674 * race we set nounmapwait flag in as structure if we drop softlockcnt
1658 1675 * to 0 if async is not 0. unmapwait thread
1659 1676 * will not block if this flag is set.
1660 1677 */
1661 1678 if (async)
1662 1679 mutex_enter(&shmd->shm_segfree_syncmtx);
1663 1680
1664 1681 /*
1665 1682 * Now decrement softlockcnt.
1666 1683 */
1667 1684 ASSERT(shmd->shm_softlockcnt > 0);
1668 1685 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1669 1686
1670 1687 if (shmd->shm_softlockcnt <= 0) {
1671 1688 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1672 1689 mutex_enter(&seg->s_as->a_contents);
1673 1690 if (async)
1674 1691 AS_SETNOUNMAPWAIT(seg->s_as);
1675 1692 if (AS_ISUNMAPWAIT(seg->s_as)) {
1676 1693 AS_CLRUNMAPWAIT(seg->s_as);
1677 1694 cv_broadcast(&seg->s_as->a_cv);
1678 1695 }
1679 1696 mutex_exit(&seg->s_as->a_contents);
1680 1697 }
1681 1698 }
1682 1699
1683 1700 if (async)
1684 1701 mutex_exit(&shmd->shm_segfree_syncmtx);
1685 1702
1686 1703 return (done);
1687 1704 }
1688 1705
1689 1706 /*
1690 1707 * Do a F_SOFTUNLOCK call over the range requested.
1691 1708 * The range must have already been F_SOFTLOCK'ed.
1692 1709 *
1693 1710 * The calls to acquire and release the anon map lock mutex were
1694 1711 * removed in order to avoid a deadly embrace during a DR
1695 1712 	 * memory delete operation. (E.g. DR blocks while waiting for an
1696 1713 * exclusive lock on a page that is being used for kaio; the
1697 1714 * thread that will complete the kaio and call segspt_softunlock
1698 1715 * blocks on the anon map lock; another thread holding the anon
1699 1716 * map lock blocks on another page lock via the segspt_shmfault
1700 1717 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1701 1718 *
1702 1719 * The appropriateness of the removal is based upon the following:
1703 1720 * 1. If we are holding a segment's reader lock and the page is held
1704 1721 * shared, then the corresponding element in anonmap which points to
1705 1722 * anon struct cannot change and there is no need to acquire the
1706 1723 * anonymous map lock.
1707 1724 * 2. Threads in segspt_softunlock have a reader lock on the segment
1708 1725 * and already have the shared page lock, so we are guaranteed that
1709 1726 * the anon map slot cannot change and therefore can call anon_get_ptr()
1710 1727 * without grabbing the anonymous map lock.
1711 1728 * 3. Threads that softlock a shared page break copy-on-write, even if
1712 1729 	 * it's a read. Thus cow faults can be ignored with respect to soft
1713 1730 * unlocking, since the breaking of cow means that the anon slot(s) will
1714 1731 * not be shared.
1715 1732 */
1716 1733 static void
1717 1734 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1718 1735 size_t len, enum seg_rw rw)
1719 1736 {
1720 1737 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1721 1738 struct seg *sptseg;
1722 1739 struct spt_data *sptd;
1723 1740 page_t *pp;
1724 1741 caddr_t adr;
1725 1742 struct vnode *vp;
1726 1743 u_offset_t offset;
1727 1744 ulong_t anon_index;
1728 1745 struct anon_map *amp; /* XXX - for locknest */
1729 1746 struct anon *ap = NULL;
1730 1747 pgcnt_t npages;
1731 1748
1732 1749 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1733 1750
1734 1751 sptseg = shmd->shm_sptseg;
1735 1752 sptd = sptseg->s_data;
1736 1753
1737 1754 /*
1738 1755 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1739 1756 * and therefore their pages are SE_SHARED locked
1740 1757 * for the entire life of the segment.
1741 1758 */
1742 1759 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1743 1760 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1744 1761 goto softlock_decrement;
1745 1762 }
1746 1763
1747 1764 /*
1748 1765 * Any thread is free to do a page_find and
1749 1766 * page_unlock() on the pages within this seg.
1750 1767 *
1751 1768 * We are already holding the as->a_lock on the user's
1752 1769 * real segment, but we need to hold the a_lock on the
1753 1770 * underlying dummy as. This is mostly to satisfy the
1754 1771 * underlying HAT layer.
1755 1772 */
1756 1773 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1757 1774 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1758 1775 AS_LOCK_EXIT(sptseg->s_as);
1759 1776
1760 1777 amp = sptd->spt_amp;
1761 1778 ASSERT(amp != NULL);
1762 1779 anon_index = seg_page(sptseg, sptseg_addr);
1763 1780
1764 1781 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1765 1782 ap = anon_get_ptr(amp->ahp, anon_index++);
1766 1783 ASSERT(ap != NULL);
1767 1784 swap_xlate(ap, &vp, &offset);
1768 1785
1769 1786 /*
1770 1787 * Use page_find() instead of page_lookup() to
1771 1788 * find the page since we know that it has a
1772 1789 * "shared" lock.
1773 1790 */
1774 1791 pp = page_find(vp, offset);
1775 1792 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1776 1793 if (pp == NULL) {
1777 1794 panic("segspt_softunlock: "
1778 1795 "addr %p, ap %p, vp %p, off %llx",
1779 1796 (void *)adr, (void *)ap, (void *)vp, offset);
1780 1797 /*NOTREACHED*/
1781 1798 }
1782 1799
1783 1800 if (rw == S_WRITE) {
1784 1801 hat_setrefmod(pp);
1785 1802 } else if (rw != S_OTHER) {
1786 1803 hat_setref(pp);
1787 1804 }
1788 1805 page_unlock(pp);
1789 1806 }
1790 1807
1791 1808 softlock_decrement:
1792 1809 npages = btopr(len);
1793 1810 ASSERT(shmd->shm_softlockcnt >= npages);
1794 1811 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1795 1812 if (shmd->shm_softlockcnt == 0) {
1796 1813 /*
1797 1814 * All SOFTLOCKS are gone. Wakeup any waiting
1798 1815 * unmappers so they can try again to unmap.
1799 1816 * Check for waiters first without the mutex
1800 1817 * held so we don't always grab the mutex on
1801 1818 * softunlocks.
1802 1819 */
1803 1820 if (AS_ISUNMAPWAIT(seg->s_as)) {
1804 1821 mutex_enter(&seg->s_as->a_contents);
1805 1822 if (AS_ISUNMAPWAIT(seg->s_as)) {
1806 1823 AS_CLRUNMAPWAIT(seg->s_as);
1807 1824 cv_broadcast(&seg->s_as->a_cv);
1808 1825 }
1809 1826 mutex_exit(&seg->s_as->a_contents);
1810 1827 }
1811 1828 }
1812 1829 }
1813 1830
1814 1831 int
1815 1832 segspt_shmattach(struct seg **segpp, void *argsp)
1816 1833 {
1817 1834 struct seg *seg = *segpp;
1818 1835 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1819 1836 struct shm_data *shmd;
1820 1837 struct anon_map *shm_amp = shmd_arg->shm_amp;
1821 1838 struct spt_data *sptd;
1822 1839 int error = 0;
1823 1840
1824 1841 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1825 1842
1826 1843 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1827 1844 if (shmd == NULL)
1828 1845 return (ENOMEM);
1829 1846
1830 1847 shmd->shm_sptas = shmd_arg->shm_sptas;
1831 1848 shmd->shm_amp = shm_amp;
1832 1849 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1833 1850
1834 1851 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1835 1852 NULL, 0, seg->s_size);
1836 1853
1837 1854 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1838 1855
1839 1856 seg->s_data = (void *)shmd;
1840 1857 seg->s_ops = &segspt_shmops;
1841 1858 seg->s_szc = shmd->shm_sptseg->s_szc;
1842 1859 sptd = shmd->shm_sptseg->s_data;
1843 1860
1844 1861 if (sptd->spt_flags & SHM_PAGEABLE) {
1845 1862 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1846 1863 KM_NOSLEEP)) == NULL) {
1847 1864 seg->s_data = (void *)NULL;
1848 1865 kmem_free(shmd, (sizeof (*shmd)));
1849 1866 return (ENOMEM);
1850 1867 }
1851 1868 shmd->shm_lckpgs = 0;
1852 1869 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1853 1870 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1854 1871 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1855 1872 seg->s_size, seg->s_szc)) != 0) {
1856 1873 kmem_free(shmd->shm_vpage,
1857 1874 btopr(shm_amp->size));
1858 1875 }
1859 1876 }
1860 1877 } else {
1861 1878 error = hat_share(seg->s_as->a_hat, seg->s_base,
1862 1879 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1863 1880 seg->s_size, seg->s_szc);
1864 1881 }
1865 1882 if (error) {
1866 1883 seg->s_szc = 0;
1867 1884 seg->s_data = (void *)NULL;
1868 1885 kmem_free(shmd, (sizeof (*shmd)));
1869 1886 } else {
1870 1887 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1871 1888 shm_amp->refcnt++;
1872 1889 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1873 1890 }
1874 1891 return (error);
1875 1892 }
1876 1893
1877 1894 int
1878 1895 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1879 1896 {
1880 1897 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1881 1898 int reclaim = 1;
1882 1899
1883 1900 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1884 1901 retry:
1885 1902 if (shmd->shm_softlockcnt > 0) {
1886 1903 if (reclaim == 1) {
1887 1904 segspt_purge(seg);
1888 1905 reclaim = 0;
1889 1906 goto retry;
1890 1907 }
1891 1908 return (EAGAIN);
1892 1909 }
1893 1910
1894 1911 if (ssize != seg->s_size) {
1895 1912 #ifdef DEBUG
1896 1913 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1897 1914 ssize, seg->s_size);
1898 1915 #endif
1899 1916 return (EINVAL);
1900 1917 }
1901 1918
1902 1919 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1903 1920 NULL, 0);
1904 1921 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1905 1922
1906 1923 seg_free(seg);
1907 1924
1908 1925 return (0);
1909 1926 }
1910 1927
1911 1928 void
1912 1929 segspt_shmfree(struct seg *seg)
1913 1930 {
1914 1931 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1915 1932 struct anon_map *shm_amp = shmd->shm_amp;
1916 1933
1917 1934 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1918 1935
1919 1936 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1920 1937 MC_UNLOCK, NULL, 0);
1921 1938
1922 1939 /*
1923 1940 * Need to increment refcnt when attaching
1924 1941 * and decrement when detaching because of dup().
1925 1942 */
1926 1943 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1927 1944 shm_amp->refcnt--;
1928 1945 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1929 1946
1930 1947 if (shmd->shm_vpage) { /* only for DISM */
1931 1948 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1932 1949 shmd->shm_vpage = NULL;
1933 1950 }
1934 1951
1935 1952 /*
1936 1953 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1937 1954 * still working with this segment without holding as lock.
1938 1955 */
1939 1956 ASSERT(shmd->shm_softlockcnt == 0);
1940 1957 mutex_enter(&shmd->shm_segfree_syncmtx);
1941 1958 mutex_destroy(&shmd->shm_segfree_syncmtx);
1942 1959
1943 1960 kmem_free(shmd, sizeof (*shmd));
1944 1961 }
1945 1962
1946 1963 /*ARGSUSED*/
1947 1964 int
1948 1965 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1949 1966 {
1950 1967 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1951 1968
1952 1969 /*
1953 1970 * Shared page table is more than shared mapping.
1954 1971 * Individual process sharing page tables can't change prot
1955 1972 * because there is only one set of page tables.
1956 1973 * This will be allowed after private page table is
1957 1974 * supported.
1958 1975 */
1959 1976 /* need to return correct status error? */
1960 1977 return (0);
1961 1978 }
1962 1979
1963 1980
1964 1981 faultcode_t
1965 1982 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1966 1983 size_t len, enum fault_type type, enum seg_rw rw)
1967 1984 {
1968 1985 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1969 1986 struct seg *sptseg = shmd->shm_sptseg;
1970 1987 struct as *curspt = shmd->shm_sptas;
1971 1988 struct spt_data *sptd = sptseg->s_data;
1972 1989 pgcnt_t npages;
1973 1990 size_t size;
1974 1991 caddr_t segspt_addr, shm_addr;
1975 1992 page_t **ppa;
1976 1993 int i;
1977 1994 ulong_t an_idx = 0;
1978 1995 int err = 0;
1979 1996 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1980 1997 size_t pgsz;
1981 1998 pgcnt_t pgcnt;
1982 1999 caddr_t a;
1983 2000 pgcnt_t pidx;
1984 2001
1985 2002 #ifdef lint
1986 2003 hat = hat;
1987 2004 #endif
1988 2005 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1989 2006
1990 2007 /*
1991 2008 * Because of the way spt is implemented
1992 2009 * the realsize of the segment does not have to be
1993 2010 * equal to the segment size itself. The segment size is
1994 2011 * often in multiples of a page size larger than PAGESIZE.
1995 2012 * The realsize is rounded up to the nearest PAGESIZE
1996 2013 * based on what the user requested. This is a bit of
1997 2014 	 * ugliness that is historical but not easily fixed
1998 2015 * without re-designing the higher levels of ISM.
1999 2016 */
2000 2017 ASSERT(addr >= seg->s_base);
2001 2018 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2002 2019 return (FC_NOMAP);
2003 2020 /*
2004 2021 * For all of the following cases except F_PROT, we need to
2005 2022 * make any necessary adjustments to addr and len
2006 2023 * and get all of the necessary page_t's into an array called ppa[].
2007 2024 *
2008 2025 * The code in shmat() forces base addr and len of ISM segment
2009 2026 * to be aligned to largest page size supported. Therefore,
2010 2027 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2011 2028 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2012 2029 * in large pagesize chunks, or else we will screw up the HAT
2013 2030 * layer by calling hat_memload_array() with differing page sizes
2014 2031 * over a given virtual range.
2015 2032 */
2016 2033 pgsz = page_get_pagesize(sptseg->s_szc);
2017 2034 pgcnt = page_get_pagecnt(sptseg->s_szc);
2018 2035 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2019 2036 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2020 2037 npages = btopr(size);
2021 2038
2022 2039 /*
2023 2040 * Now we need to convert from addr in segshm to addr in segspt.
2024 2041 */
2025 2042 an_idx = seg_page(seg, shm_addr);
2026 2043 segspt_addr = sptseg->s_base + ptob(an_idx);
2027 2044
2028 2045 ASSERT((segspt_addr + ptob(npages)) <=
2029 2046 (sptseg->s_base + sptd->spt_realsize));
2030 2047 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
2031 2048
2032 2049 switch (type) {
2033 2050
2034 2051 case F_SOFTLOCK:
2035 2052
2036 2053 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2037 2054 /*
2038 2055 * Fall through to the F_INVAL case to load up the hat layer
2039 2056 * entries with the HAT_LOAD_LOCK flag.
2040 2057 */
2041 2058 /* FALLTHRU */
2042 2059 case F_INVAL:
2043 2060
2044 2061 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2045 2062 return (FC_NOMAP);
2046 2063
2047 2064 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
2048 2065
2049 2066 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
2050 2067 if (err != 0) {
2051 2068 if (type == F_SOFTLOCK) {
2052 2069 atomic_add_long((ulong_t *)(
2053 2070 &(shmd->shm_softlockcnt)), -npages);
2054 2071 }
2055 2072 goto dism_err;
2056 2073 }
2057 2074 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2058 2075 a = segspt_addr;
2059 2076 pidx = 0;
2060 2077 if (type == F_SOFTLOCK) {
2061 2078
2062 2079 /*
2063 2080 * Load up the translation keeping it
2064 2081 * locked and don't unlock the page.
2065 2082 */
2066 2083 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2067 2084 hat_memload_array(sptseg->s_as->a_hat,
2068 2085 a, pgsz, &ppa[pidx], sptd->spt_prot,
2069 2086 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2070 2087 }
2071 2088 } else {
2072 2089 /*
2073 2090 * Migrate pages marked for migration
2074 2091 */
2075 2092 if (lgrp_optimizations())
2076 2093 page_migrate(seg, shm_addr, ppa, npages);
2077 2094
2078 2095 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2079 2096 hat_memload_array(sptseg->s_as->a_hat,
2080 2097 a, pgsz, &ppa[pidx],
2081 2098 sptd->spt_prot,
2082 2099 HAT_LOAD_SHARE);
2083 2100 }
2084 2101
2085 2102 /*
2086 2103 * And now drop the SE_SHARED lock(s).
2087 2104 */
2088 2105 if (dyn_ism_unmap) {
2089 2106 for (i = 0; i < npages; i++) {
2090 2107 page_unlock(ppa[i]);
2091 2108 }
2092 2109 }
2093 2110 }
2094 2111
2095 2112 if (!dyn_ism_unmap) {
2096 2113 if (hat_share(seg->s_as->a_hat, shm_addr,
2097 2114 curspt->a_hat, segspt_addr, ptob(npages),
2098 2115 seg->s_szc) != 0) {
2099 2116 panic("hat_share err in DISM fault");
2100 2117 /* NOTREACHED */
2101 2118 }
2102 2119 if (type == F_INVAL) {
2103 2120 for (i = 0; i < npages; i++) {
2104 2121 page_unlock(ppa[i]);
2105 2122 }
2106 2123 }
2107 2124 }
2108 2125 AS_LOCK_EXIT(sptseg->s_as);
2109 2126 dism_err:
2110 2127 kmem_free(ppa, npages * sizeof (page_t *));
2111 2128 return (err);
2112 2129
2113 2130 case F_SOFTUNLOCK:
2114 2131
2115 2132 /*
2116 2133 * This is a bit ugly, we pass in the real seg pointer,
2117 2134 * but the segspt_addr is the virtual address within the
2118 2135 * dummy seg.
2119 2136 */
2120 2137 segspt_softunlock(seg, segspt_addr, size, rw);
2121 2138 return (0);
2122 2139
2123 2140 case F_PROT:
2124 2141
2125 2142 /*
2126 2143 * This takes care of the unusual case where a user
2127 2144 * allocates a stack in shared memory and a register
2128 2145 * window overflow is written to that stack page before
2129 2146 * it is otherwise modified.
2130 2147 *
2131 2148 * We can get away with this because ISM segments are
2132 2149 * always rw. Other than this unusual case, there
2133 2150 * should be no instances of protection violations.
2134 2151 */
2135 2152 return (0);
2136 2153
2137 2154 default:
2138 2155 #ifdef DEBUG
2139 2156 panic("segspt_dismfault default type?");
2140 2157 #else
2141 2158 return (FC_NOMAP);
2142 2159 #endif
2143 2160 }
2144 2161 }
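
The shm_addr/size computation above (repeated in segspt_shmfault() below) expands a fault on any address into a request that covers whole large pages, so the HAT is always loaded in uniform large-page chunks. Below is a minimal userland sketch of that arithmetic, assuming a 4MB large page and made-up addresses; MY_P2ALIGN and MY_P2ROUNDUP are local stand-ins for the kernel's P2ALIGN/P2ROUNDUP macros, not the kernel code itself.

#include <stdio.h>
#include <stdint.h>

/* Local stand-ins for the kernel's P2ALIGN/P2ROUNDUP (power-of-2 align). */
#define	MY_P2ALIGN(x, align)	((x) & -(align))
#define	MY_P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uintptr_t pgsz = 4UL * 1024 * 1024;	/* assume a 4MB large page */
	uintptr_t addr = 0x80003000UL, len = 0x2000UL;

	uintptr_t shm_addr = MY_P2ALIGN(addr, pgsz);
	uintptr_t size = MY_P2ROUNDUP(addr + len - shm_addr, pgsz);

	/* An 8KB fault at an unaligned address expands to one whole 4MB page. */
	printf("shm_addr=0x%lx size=0x%lx\n",
	    (unsigned long)shm_addr, (unsigned long)size);
	return (0);
}
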
2145 2162
2146 2163
2147 2164 faultcode_t
2148 2165 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2149 2166 size_t len, enum fault_type type, enum seg_rw rw)
2150 2167 {
2151 2168 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2152 2169 struct seg *sptseg = shmd->shm_sptseg;
2153 2170 struct as *curspt = shmd->shm_sptas;
2154 2171 struct spt_data *sptd = sptseg->s_data;
2155 2172 pgcnt_t npages;
2156 2173 size_t size;
2157 2174 caddr_t sptseg_addr, shm_addr;
2158 2175 page_t *pp, **ppa;
2159 2176 int i;
2160 2177 u_offset_t offset;
2161 2178 ulong_t anon_index = 0;
2162 2179 struct vnode *vp;
2163 2180 struct anon_map *amp; /* XXX - for locknest */
2164 2181 struct anon *ap = NULL;
2165 2182 size_t pgsz;
2166 2183 pgcnt_t pgcnt;
2167 2184 caddr_t a;
2168 2185 pgcnt_t pidx;
2169 2186 size_t sz;
2170 2187
2171 2188 #ifdef lint
2172 2189 hat = hat;
2173 2190 #endif
2174 2191
2175 2192 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2176 2193
2177 2194 if (sptd->spt_flags & SHM_PAGEABLE) {
2178 2195 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2179 2196 }
2180 2197
2181 2198 /*
2182 2199 * Because of the way spt is implemented
2183 2200 * the realsize of the segment does not have to be
2184 2201 * equal to the segment size itself. The segment size is
2185 2202 * often in multiples of a page size larger than PAGESIZE.
2186 2203 * The realsize is rounded up to the nearest PAGESIZE
2187 2204 * based on what the user requested. This is a bit of
2188 2205 	 * ugliness that is historical but not easily fixed
2189 2206 * without re-designing the higher levels of ISM.
2190 2207 */
2191 2208 ASSERT(addr >= seg->s_base);
2192 2209 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2193 2210 return (FC_NOMAP);
2194 2211 /*
2195 2212 * For all of the following cases except F_PROT, we need to
2196 2213 * make any necessary adjustments to addr and len
2197 2214 * and get all of the necessary page_t's into an array called ppa[].
2198 2215 *
2199 2216 * The code in shmat() forces base addr and len of ISM segment
2200 2217 * to be aligned to largest page size supported. Therefore,
2201 2218 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2202 2219 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2203 2220 * in large pagesize chunks, or else we will screw up the HAT
2204 2221 * layer by calling hat_memload_array() with differing page sizes
2205 2222 * over a given virtual range.
2206 2223 */
2207 2224 pgsz = page_get_pagesize(sptseg->s_szc);
2208 2225 pgcnt = page_get_pagecnt(sptseg->s_szc);
2209 2226 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2210 2227 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2211 2228 npages = btopr(size);
2212 2229
2213 2230 /*
2214 2231 * Now we need to convert from addr in segshm to addr in segspt.
2215 2232 */
2216 2233 anon_index = seg_page(seg, shm_addr);
2217 2234 sptseg_addr = sptseg->s_base + ptob(anon_index);
2218 2235
2219 2236 /*
2220 2237 * And now we may have to adjust npages downward if we have
2221 2238 * exceeded the realsize of the segment or initial anon
2222 2239 * allocations.
2223 2240 */
2224 2241 if ((sptseg_addr + ptob(npages)) >
2225 2242 (sptseg->s_base + sptd->spt_realsize))
2226 2243 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2227 2244
2228 2245 npages = btopr(size);
2229 2246
2230 2247 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2231 2248 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2232 2249
2233 2250 switch (type) {
2234 2251
2235 2252 case F_SOFTLOCK:
2236 2253
2237 2254 /*
2238 2255 * availrmem is decremented once during anon_swap_adjust()
2239 2256 * and is incremented during the anon_unresv(), which is
2240 2257 * called from shm_rm_amp() when the segment is destroyed.
2241 2258 */
2242 2259 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2243 2260 /*
2244 2261 * Some platforms assume that ISM pages are SE_SHARED
2245 2262 * locked for the entire life of the segment.
2246 2263 */
2247 2264 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2248 2265 return (0);
2249 2266 /*
2250 2267 * Fall through to the F_INVAL case to load up the hat layer
2251 2268 * entries with the HAT_LOAD_LOCK flag.
2252 2269 */
2253 2270
2254 2271 /* FALLTHRU */
2255 2272 case F_INVAL:
2256 2273
2257 2274 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2258 2275 return (FC_NOMAP);
2259 2276
2260 2277 /*
2261 2278 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2262 2279 * may still rely on this call to hat_share(). That
2263 2280 		 * would imply that those hats can fault on a
2264 2281 * HAT_LOAD_LOCK translation, which would seem
2265 2282 * contradictory.
2266 2283 */
2267 2284 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2268 2285 if (hat_share(seg->s_as->a_hat, seg->s_base,
2269 2286 curspt->a_hat, sptseg->s_base,
2270 2287 sptseg->s_size, sptseg->s_szc) != 0) {
2271 2288 panic("hat_share error in ISM fault");
2272 2289 /*NOTREACHED*/
2273 2290 }
2274 2291 return (0);
2275 2292 }
2276 2293 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2277 2294
2278 2295 /*
2279 2296 		 * I see no need to lock the real seg here,
2280 2297 		 * because all of our work will be on the underlying
2281 2298 * dummy seg.
2282 2299 *
2283 2300 * sptseg_addr and npages now account for large pages.
2284 2301 */
2285 2302 amp = sptd->spt_amp;
2286 2303 ASSERT(amp != NULL);
2287 2304 anon_index = seg_page(sptseg, sptseg_addr);
2288 2305
2289 2306 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2290 2307 for (i = 0; i < npages; i++) {
2291 2308 ap = anon_get_ptr(amp->ahp, anon_index++);
2292 2309 ASSERT(ap != NULL);
2293 2310 swap_xlate(ap, &vp, &offset);
2294 2311 pp = page_lookup(vp, offset, SE_SHARED);
2295 2312 ASSERT(pp != NULL);
2296 2313 ppa[i] = pp;
2297 2314 }
2298 2315 		ANON_LOCK_EXIT(&amp->a_rwlock);
2299 2316 ASSERT(i == npages);
2300 2317
2301 2318 /*
2302 2319 * We are already holding the as->a_lock on the user's
2303 2320 * real segment, but we need to hold the a_lock on the
2304 2321 * underlying dummy as. This is mostly to satisfy the
2305 2322 * underlying HAT layer.
2306 2323 */
2307 2324 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2308 2325 a = sptseg_addr;
2309 2326 pidx = 0;
2310 2327 if (type == F_SOFTLOCK) {
2311 2328 /*
2312 2329 * Load up the translation keeping it
2313 2330 * locked and don't unlock the page.
2314 2331 */
2315 2332 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2316 2333 sz = MIN(pgsz, ptob(npages - pidx));
2317 2334 hat_memload_array(sptseg->s_as->a_hat, a,
2318 2335 sz, &ppa[pidx], sptd->spt_prot,
2319 2336 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2320 2337 }
2321 2338 } else {
2322 2339 /*
2323 2340 * Migrate pages marked for migration.
2324 2341 */
2325 2342 if (lgrp_optimizations())
2326 2343 page_migrate(seg, shm_addr, ppa, npages);
2327 2344
2328 2345 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2329 2346 sz = MIN(pgsz, ptob(npages - pidx));
2330 2347 hat_memload_array(sptseg->s_as->a_hat,
2331 2348 a, sz, &ppa[pidx],
2332 2349 sptd->spt_prot, HAT_LOAD_SHARE);
2333 2350 }
2334 2351
2335 2352 /*
2336 2353 * And now drop the SE_SHARED lock(s).
2337 2354 */
2338 2355 for (i = 0; i < npages; i++)
2339 2356 page_unlock(ppa[i]);
2340 2357 }
2341 2358 AS_LOCK_EXIT(sptseg->s_as);
2342 2359
2343 2360 kmem_free(ppa, sizeof (page_t *) * npages);
2344 2361 return (0);
2345 2362 case F_SOFTUNLOCK:
2346 2363
2347 2364 /*
2348 2365 * This is a bit ugly, we pass in the real seg pointer,
2349 2366 * but the sptseg_addr is the virtual address within the
2350 2367 * dummy seg.
2351 2368 */
2352 2369 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2353 2370 return (0);
2354 2371
2355 2372 case F_PROT:
2356 2373
2357 2374 /*
2358 2375 * This takes care of the unusual case where a user
2359 2376 * allocates a stack in shared memory and a register
2360 2377 * window overflow is written to that stack page before
2361 2378 * it is otherwise modified.
2362 2379 *
2363 2380 * We can get away with this because ISM segments are
2364 2381 * always rw. Other than this unusual case, there
2365 2382 * should be no instances of protection violations.
2366 2383 */
2367 2384 return (0);
2368 2385
2369 2386 default:
2370 2387 #ifdef DEBUG
2371 2388 cmn_err(CE_WARN, "segspt_shmfault default type?");
2372 2389 #endif
2373 2390 return (FC_NOMAP);
2374 2391 }
2375 2392 }
2376 2393
2377 2394 /*ARGSUSED*/
2378 2395 static faultcode_t
2379 2396 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2380 2397 {
2381 2398 return (0);
2382 2399 }
2383 2400
2384 2401 /*ARGSUSED*/
2385 2402 static int
2386 2403 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2387 2404 {
2388 2405 return (0);
2389 2406 }
2390 2407
2391 2408 /*ARGSUSED*/
2392 2409 static size_t
2393 2410 segspt_shmswapout(struct seg *seg)
2394 2411 {
2395 2412 return (0);
2396 2413 }
2397 2414
2398 2415 /*
2399 2416 * duplicate the shared page tables
2400 2417 */
2401 2418 int
2402 2419 segspt_shmdup(struct seg *seg, struct seg *newseg)
2403 2420 {
2404 2421 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2405 2422 struct anon_map *amp = shmd->shm_amp;
2406 2423 struct shm_data *shmd_new;
2407 2424 struct seg *spt_seg = shmd->shm_sptseg;
2408 2425 struct spt_data *sptd = spt_seg->s_data;
2409 2426 int error = 0;
2410 2427
2411 2428 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2412 2429
2413 2430 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2414 2431 newseg->s_data = (void *)shmd_new;
2415 2432 shmd_new->shm_sptas = shmd->shm_sptas;
2416 2433 shmd_new->shm_amp = amp;
2417 2434 shmd_new->shm_sptseg = shmd->shm_sptseg;
2418 2435 newseg->s_ops = &segspt_shmops;
2419 2436 newseg->s_szc = seg->s_szc;
2420 2437 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2421 2438
2422 2439 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2423 2440 amp->refcnt++;
2424 2441 	ANON_LOCK_EXIT(&amp->a_rwlock);
2425 2442
2426 2443 if (sptd->spt_flags & SHM_PAGEABLE) {
2427 2444 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2428 2445 shmd_new->shm_lckpgs = 0;
2429 2446 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2430 2447 if ((error = hat_share(newseg->s_as->a_hat,
2431 2448 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2432 2449 seg->s_size, seg->s_szc)) != 0) {
2433 2450 kmem_free(shmd_new->shm_vpage,
2434 2451 btopr(amp->size));
2435 2452 }
2436 2453 }
2437 2454 return (error);
2438 2455 } else {
2439 2456 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2440 2457 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2441 2458 seg->s_szc));
2442 2459
2443 2460 }
2444 2461 }
2445 2462
2446 2463 /*ARGSUSED*/
2447 2464 int
2448 2465 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2449 2466 {
2450 2467 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2451 2468 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2452 2469
2453 2470 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2454 2471
2455 2472 /*
2456 2473 * ISM segment is always rw.
2457 2474 */
2458 2475 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2459 2476 }
2460 2477
2461 2478 /*
2462 2479 * Return an array of locked large pages, for empty slots allocate
2463 2480 * private zero-filled anon pages.
2464 2481 */
2465 2482 static int
2466 2483 spt_anon_getpages(
2467 2484 struct seg *sptseg,
2468 2485 caddr_t sptaddr,
2469 2486 size_t len,
2470 2487 page_t *ppa[])
2471 2488 {
2472 2489 struct spt_data *sptd = sptseg->s_data;
2473 2490 struct anon_map *amp = sptd->spt_amp;
2474 2491 enum seg_rw rw = sptd->spt_prot;
2475 2492 uint_t szc = sptseg->s_szc;
2476 2493 size_t pg_sz, share_sz = page_get_pagesize(szc);
2477 2494 pgcnt_t lp_npgs;
2478 2495 caddr_t lp_addr, e_sptaddr;
2479 2496 uint_t vpprot, ppa_szc = 0;
2480 2497 struct vpage *vpage = NULL;
2481 2498 ulong_t j, ppa_idx;
2482 2499 int err, ierr = 0;
2483 2500 pgcnt_t an_idx;
2484 2501 anon_sync_obj_t cookie;
2485 2502 int anon_locked = 0;
2486 2503 pgcnt_t amp_pgs;
2487 2504
2488 2505
2489 2506 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2490 2507 ASSERT(len != 0);
2491 2508
2492 2509 pg_sz = share_sz;
2493 2510 lp_npgs = btop(pg_sz);
2494 2511 lp_addr = sptaddr;
2495 2512 e_sptaddr = sptaddr + len;
2496 2513 an_idx = seg_page(sptseg, sptaddr);
2497 2514 ppa_idx = 0;
2498 2515
2499 2516 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2500 2517
2501 2518 amp_pgs = page_get_pagecnt(amp->a_szc);
2502 2519
2503 2520 /*CONSTCOND*/
2504 2521 while (1) {
2505 2522 for (; lp_addr < e_sptaddr;
2506 2523 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2507 2524
2508 2525 /*
2509 2526 * If we're currently locked, and we get to a new
2510 2527 * page, unlock our current anon chunk.
2511 2528 */
2512 2529 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2513 2530 anon_array_exit(&cookie);
2514 2531 anon_locked = 0;
2515 2532 }
2516 2533 if (!anon_locked) {
2517 2534 anon_array_enter(amp, an_idx, &cookie);
2518 2535 anon_locked = 1;
2519 2536 }
2520 2537 ppa_szc = (uint_t)-1;
2521 2538 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2522 2539 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2523 2540 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2524 2541
2525 2542 if (ierr != 0) {
2526 2543 if (ierr > 0) {
2527 2544 err = FC_MAKE_ERR(ierr);
2528 2545 goto lpgs_err;
2529 2546 }
2530 2547 break;
2531 2548 }
2532 2549 }
2533 2550 if (lp_addr == e_sptaddr) {
2534 2551 break;
2535 2552 }
2536 2553 ASSERT(lp_addr < e_sptaddr);
2537 2554
2538 2555 /*
2539 2556 * ierr == -1 means we failed to allocate a large page.
2540 2557 * so do a size down operation.
2541 2558 *
2542 2559 * ierr == -2 means some other process that privately shares
2543 2560 * pages with this process has allocated a larger page and we
2544 2561 * need to retry with larger pages. So do a size up
2545 2562 * operation. This relies on the fact that large pages are
2546 2563 * never partially shared i.e. if we share any constituent
2547 2564 * page of a large page with another process we must share the
2548 2565 * entire large page. Note this cannot happen for SOFTLOCK
2549 2566 * case, unless current address (lpaddr) is at the beginning
2550 2567 * of the next page size boundary because the other process
2551 2568 * couldn't have relocated locked pages.
2552 2569 */
2553 2570 ASSERT(ierr == -1 || ierr == -2);
2554 2571 if (segvn_anypgsz) {
2555 2572 ASSERT(ierr == -2 || szc != 0);
2556 2573 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2557 2574 szc = (ierr == -1) ? szc - 1 : szc + 1;
2558 2575 } else {
2559 2576 /*
2560 2577 * For faults and segvn_anypgsz == 0
2561 2578 * we need to be careful not to loop forever
2562 2579 * if existing page is found with szc other
2563 2580 * than 0 or seg->s_szc. This could be due
2564 2581 * to page relocations on behalf of DR or
2565 2582 * more likely large page creation. For this
2566 2583 * case simply re-size to existing page's szc
2567 2584 * if returned by anon_map_getpages().
2568 2585 */
2569 2586 if (ppa_szc == (uint_t)-1) {
2570 2587 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2571 2588 } else {
2572 2589 ASSERT(ppa_szc <= sptseg->s_szc);
2573 2590 ASSERT(ierr == -2 || ppa_szc < szc);
2574 2591 ASSERT(ierr == -1 || ppa_szc > szc);
2575 2592 szc = ppa_szc;
2576 2593 }
2577 2594 }
2578 2595 pg_sz = page_get_pagesize(szc);
2579 2596 lp_npgs = btop(pg_sz);
2580 2597 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2581 2598 }
2582 2599 if (anon_locked) {
2583 2600 anon_array_exit(&cookie);
2584 2601 }
2585 2602 	ANON_LOCK_EXIT(&amp->a_rwlock);
2586 2603 return (0);
2587 2604
2588 2605 lpgs_err:
2589 2606 if (anon_locked) {
2590 2607 anon_array_exit(&cookie);
2591 2608 }
2592 2609 	ANON_LOCK_EXIT(&amp->a_rwlock);
2593 2610 for (j = 0; j < ppa_idx; j++)
2594 2611 page_unlock(ppa[j]);
2595 2612 return (err);
2596 2613 }
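
The ierr == -1 / ierr == -2 handling above boils down to a small size-adjustment rule. The sketch below restates that rule in userland C as an illustration only; next_szc(), seg_szc, anypgsz and found_szc are hypothetical names standing in for szc, sptseg->s_szc, segvn_anypgsz and ppa_szc (with found_szc < 0 meaning no existing page size was reported).

#include <stdio.h>
#include <assert.h>

/*
 * ierr == -1: the large-page allocation failed, so size down.
 * ierr == -2: another process already shares a larger page, so size up.
 * With anypgsz == 0, jump directly to 0, the segment size, or the size
 * that was actually found, as the kernel loop does.
 */
static unsigned
next_szc(int ierr, unsigned szc, unsigned seg_szc, int anypgsz, int found_szc)
{
	assert(ierr == -1 || ierr == -2);
	if (anypgsz)
		return ((ierr == -1) ? szc - 1 : szc + 1);
	if (found_szc < 0)
		return ((ierr == -1) ? 0 : seg_szc);
	return ((unsigned)found_szc);
}

int
main(void)
{
	/* allocation of the largest page failed: step one size down */
	printf("%u\n", next_szc(-1, 3, 3, 1, -1));
	/* another process holds a larger page: step one size up */
	printf("%u\n", next_szc(-2, 1, 3, 1, -1));
	/* anypgsz == 0: go straight to the size that was actually found */
	printf("%u\n", next_szc(-2, 0, 3, 0, 2));
	return (0);
}
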
2597 2614
2598 2615 /*
2599 2616 * count the number of bytes in a set of spt pages that are currently not
2600 2617 * locked
2601 2618 */
2602 2619 static rctl_qty_t
2603 2620 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2604 2621 {
2605 2622 ulong_t i;
2606 2623 rctl_qty_t unlocked = 0;
2607 2624
2608 2625 for (i = 0; i < npages; i++) {
2609 2626 if (ppa[i]->p_lckcnt == 0)
2610 2627 unlocked += PAGESIZE;
2611 2628 }
2612 2629 return (unlocked);
2613 2630 }
2614 2631
2615 2632 extern u_longlong_t randtick(void);
2616 2633 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2617 2634 #define NLCK (NCPU_P2)
2618 2635 /* Random number with a range [0, n-1], n must be power of two */
2619 2636 #define RAND_P2(n) \
2620 2637 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
2621 2638
2622 2639 int
2623 2640 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2624 2641 page_t **ppa, ulong_t *lockmap, size_t pos,
2625 2642 rctl_qty_t *locked)
2626 2643 {
2627 2644 struct shm_data *shmd = seg->s_data;
2628 2645 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2629 2646 ulong_t i;
2630 2647 int kernel;
2631 2648 pgcnt_t nlck = 0;
2632 2649 int rv = 0;
2633 2650 int use_reserved = 1;
2634 2651
2635 2652 /* return the number of bytes actually locked */
2636 2653 *locked = 0;
2637 2654
2638 2655 /*
2639 2656 * To avoid contention on freemem_lock, availrmem and pages_locked
2640 2657 * global counters are updated only every nlck locked pages instead of
2641 2658 * every time. Reserve nlck locks up front and deduct from this
2642 2659 * reservation for each page that requires a lock. When the reservation
2643 2660 * is consumed, reserve again. nlck is randomized, so the competing
2644 2661 * threads do not fall into a cyclic lock contention pattern. When
2645 2662 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2646 2663 * is used to lock pages.
2647 2664 */
2648 2665 for (i = 0; i < npages; anon_index++, pos++, i++) {
2649 2666 if (nlck == 0 && use_reserved == 1) {
2650 2667 nlck = NLCK + RAND_P2(NLCK);
2651 2668 /* if fewer loops left, decrease nlck */
2652 2669 nlck = MIN(nlck, npages - i);
2653 2670 /*
2654 2671 * Reserve nlck locks up front and deduct from this
2655 2672 * reservation for each page that requires a lock. When
2656 2673 * the reservation is consumed, reserve again.
2657 2674 */
2658 2675 mutex_enter(&freemem_lock);
2659 2676 if ((availrmem - nlck) < pages_pp_maximum) {
2660 2677 /* Do not do advance memory reserves */
2661 2678 use_reserved = 0;
2662 2679 } else {
2663 2680 availrmem -= nlck;
2664 2681 pages_locked += nlck;
2665 2682 }
2666 2683 mutex_exit(&freemem_lock);
2667 2684 }
2668 2685 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2669 2686 if (sptd->spt_ppa_lckcnt[anon_index] <
2670 2687 (ushort_t)DISM_LOCK_MAX) {
2671 2688 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2672 2689 (ushort_t)DISM_LOCK_MAX) {
2673 2690 cmn_err(CE_WARN,
2674 2691 "DISM page lock limit "
2675 2692 "reached on DISM offset 0x%lx\n",
2676 2693 anon_index << PAGESHIFT);
2677 2694 }
2678 2695 kernel = (sptd->spt_ppa &&
2679 2696 sptd->spt_ppa[anon_index]);
2680 2697 if (!page_pp_lock(ppa[i], 0, kernel ||
2681 2698 use_reserved)) {
2682 2699 sptd->spt_ppa_lckcnt[anon_index]--;
2683 2700 rv = EAGAIN;
2684 2701 break;
2685 2702 }
2686 2703 /* if this is a newly locked page, count it */
2687 2704 if (ppa[i]->p_lckcnt == 1) {
2688 2705 if (kernel == 0 && use_reserved == 1)
2689 2706 nlck--;
2690 2707 *locked += PAGESIZE;
2691 2708 }
2692 2709 shmd->shm_lckpgs++;
2693 2710 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2694 2711 if (lockmap != NULL)
2695 2712 BT_SET(lockmap, pos);
2696 2713 }
2697 2714 }
2698 2715 }
2699 2716 /* Return unused lock reservation */
2700 2717 if (nlck != 0 && use_reserved == 1) {
2701 2718 mutex_enter(&freemem_lock);
2702 2719 availrmem += nlck;
2703 2720 pages_locked -= nlck;
2704 2721 mutex_exit(&freemem_lock);
2705 2722 }
2706 2723
2707 2724 return (rv);
2708 2725 }
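
spt_lockpages() above touches freemem_lock only once per randomized batch of NLCK..2*NLCK-1 pages rather than once per locked page, and the randomization keeps competing threads from reserving in lockstep (spt_unlockpages() below plays the same trick in reverse). The toy program below models just that accounting in userland; NLCK_TOY, availrmem_toy and the page count are arbitrary stand-ins, and rand() stands in for the RAND_P2()/randtick() perturbation.

#include <stdio.h>
#include <stdlib.h>

#define	NLCK_TOY	8

static long availrmem_toy = 1000;	/* the contended global to keep cold */

int
main(void)
{
	long npages = 20, i, nlck = 0, global_updates = 0;

	for (i = 0; i < npages; i++) {
		if (nlck == 0) {
			nlck = NLCK_TOY + (rand() % NLCK_TOY);	/* 8..15 */
			if (nlck > npages - i)
				nlck = npages - i;	/* don't over-reserve */
			availrmem_toy -= nlck;		/* one global update */
			global_updates++;
		}
		/* "lock" page i, consuming one unit of the reservation */
		nlck--;
	}
	if (nlck != 0) {
		availrmem_toy += nlck;	/* return any unused reservation */
		global_updates++;
	}
	printf("availrmem=%ld (expected %ld), global updates=%ld\n",
	    availrmem_toy, 1000L - npages, global_updates);
	return (0);
}
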
2709 2726
2710 2727 int
2711 2728 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2712 2729 rctl_qty_t *unlocked)
2713 2730 {
2714 2731 struct shm_data *shmd = seg->s_data;
2715 2732 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2716 2733 struct anon_map *amp = sptd->spt_amp;
2717 2734 struct anon *ap;
2718 2735 struct vnode *vp;
2719 2736 u_offset_t off;
2720 2737 struct page *pp;
2721 2738 int kernel;
2722 2739 anon_sync_obj_t cookie;
2723 2740 ulong_t i;
2724 2741 pgcnt_t nlck = 0;
2725 2742 pgcnt_t nlck_limit = NLCK;
2726 2743
2727 2744 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2728 2745 for (i = 0; i < npages; i++, anon_index++) {
2729 2746 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2730 2747 anon_array_enter(amp, anon_index, &cookie);
2731 2748 ap = anon_get_ptr(amp->ahp, anon_index);
2732 2749 ASSERT(ap);
2733 2750
2734 2751 swap_xlate(ap, &vp, &off);
2735 2752 anon_array_exit(&cookie);
2736 2753 pp = page_lookup(vp, off, SE_SHARED);
2737 2754 ASSERT(pp);
2738 2755 /*
2739 2756 * availrmem is decremented only for pages which are not
2740 2757 			 * in seg pcache; for pages in seg pcache availrmem was
2741 2758 * decremented in _dismpagelock()
2742 2759 */
2743 2760 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2744 2761 ASSERT(pp->p_lckcnt > 0);
2745 2762
2746 2763 /*
2747 2764 			 * unlock the page but do not change availrmem; we do that
2748 2765 * ourselves every nlck loops.
2749 2766 */
2750 2767 page_pp_unlock(pp, 0, 1);
2751 2768 if (pp->p_lckcnt == 0) {
2752 2769 if (kernel == 0)
2753 2770 nlck++;
2754 2771 *unlocked += PAGESIZE;
2755 2772 }
2756 2773 page_unlock(pp);
2757 2774 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2758 2775 sptd->spt_ppa_lckcnt[anon_index]--;
2759 2776 shmd->shm_lckpgs--;
2760 2777 }
2761 2778
2762 2779 /*
2763 2780 * To reduce freemem_lock contention, do not update availrmem
2764 2781 * until at least NLCK pages have been unlocked.
2765 2782 * 1. No need to update if nlck is zero
2766 2783 * 2. Always update if the last iteration
2767 2784 */
2768 2785 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2769 2786 mutex_enter(&freemem_lock);
2770 2787 availrmem += nlck;
2771 2788 pages_locked -= nlck;
2772 2789 mutex_exit(&freemem_lock);
2773 2790 nlck = 0;
2774 2791 nlck_limit = NLCK + RAND_P2(NLCK);
2775 2792 }
2776 2793 }
2777 2794 	ANON_LOCK_EXIT(&amp->a_rwlock);
2778 2795
2779 2796 return (0);
2780 2797 }
2781 2798
2782 2799 /*ARGSUSED*/
2783 2800 static int
2784 2801 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2785 2802 int attr, int op, ulong_t *lockmap, size_t pos)
2786 2803 {
2787 2804 struct shm_data *shmd = seg->s_data;
2788 2805 struct seg *sptseg = shmd->shm_sptseg;
2789 2806 struct spt_data *sptd = sptseg->s_data;
2790 2807 struct kshmid *sp = sptd->spt_amp->a_sp;
2791 2808 pgcnt_t npages, a_npages;
2792 2809 page_t **ppa;
2793 2810 pgcnt_t an_idx, a_an_idx, ppa_idx;
2794 2811 caddr_t spt_addr, a_addr; /* spt and aligned address */
2795 2812 size_t a_len; /* aligned len */
2796 2813 size_t share_sz;
2797 2814 ulong_t i;
2798 2815 int sts = 0;
2799 2816 rctl_qty_t unlocked = 0;
2800 2817 rctl_qty_t locked = 0;
2801 2818 struct proc *p = curproc;
2802 2819 kproject_t *proj;
2803 2820
2804 2821 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2805 2822 ASSERT(sp != NULL);
2806 2823
2807 2824 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2808 2825 return (0);
2809 2826 }
2810 2827
2811 2828 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2812 2829 an_idx = seg_page(seg, addr);
2813 2830 npages = btopr(len);
2814 2831
2815 2832 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2816 2833 return (ENOMEM);
2817 2834 }
2818 2835
2819 2836 /*
2820 2837 * A shm's project never changes, so no lock needed.
2821 2838 * The shm has a hold on the project, so it will not go away.
2822 2839 * Since we have a mapping to shm within this zone, we know
2823 2840 * that the zone will not go away.
2824 2841 */
2825 2842 proj = sp->shm_perm.ipc_proj;
2826 2843
2827 2844 if (op == MC_LOCK) {
2828 2845
2829 2846 /*
2830 2847 * Need to align addr and size request if they are not
2831 2848 * aligned so we can always allocate large page(s) however
2832 2849 * we only lock what was requested in initial request.
2833 2850 */
2834 2851 share_sz = page_get_pagesize(sptseg->s_szc);
2835 2852 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2836 2853 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2837 2854 share_sz);
2838 2855 a_npages = btop(a_len);
2839 2856 a_an_idx = seg_page(seg, a_addr);
2840 2857 spt_addr = sptseg->s_base + ptob(a_an_idx);
2841 2858 ppa_idx = an_idx - a_an_idx;
2842 2859
2843 2860 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2844 2861 KM_NOSLEEP)) == NULL) {
2845 2862 return (ENOMEM);
2846 2863 }
2847 2864
2848 2865 /*
2849 2866 * Don't cache any new pages for IO and
2850 2867 * flush any cached pages.
2851 2868 */
2852 2869 mutex_enter(&sptd->spt_lock);
2853 2870 if (sptd->spt_ppa != NULL)
2854 2871 sptd->spt_flags |= DISM_PPA_CHANGED;
2855 2872
2856 2873 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2857 2874 if (sts != 0) {
2858 2875 mutex_exit(&sptd->spt_lock);
2859 2876 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2860 2877 return (sts);
2861 2878 }
2862 2879
2863 2880 mutex_enter(&sp->shm_mlock);
2864 2881 /* enforce locked memory rctl */
2865 2882 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2866 2883
2867 2884 mutex_enter(&p->p_lock);
2868 2885 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2869 2886 mutex_exit(&p->p_lock);
2870 2887 sts = EAGAIN;
2871 2888 } else {
2872 2889 mutex_exit(&p->p_lock);
2873 2890 sts = spt_lockpages(seg, an_idx, npages,
2874 2891 &ppa[ppa_idx], lockmap, pos, &locked);
2875 2892
2876 2893 /*
2877 2894 * correct locked count if not all pages could be
2878 2895 * locked
2879 2896 */
2880 2897 if ((unlocked - locked) > 0) {
2881 2898 rctl_decr_locked_mem(NULL, proj,
2882 2899 (unlocked - locked), 0);
2883 2900 }
2884 2901 }
2885 2902 /*
2886 2903 * unlock pages
2887 2904 */
2888 2905 for (i = 0; i < a_npages; i++)
2889 2906 page_unlock(ppa[i]);
2890 2907 if (sptd->spt_ppa != NULL)
2891 2908 sptd->spt_flags |= DISM_PPA_CHANGED;
2892 2909 mutex_exit(&sp->shm_mlock);
2893 2910 mutex_exit(&sptd->spt_lock);
2894 2911
2895 2912 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2896 2913
2897 2914 } else if (op == MC_UNLOCK) { /* unlock */
2898 2915 page_t **ppa;
2899 2916
2900 2917 mutex_enter(&sptd->spt_lock);
2901 2918 if (shmd->shm_lckpgs == 0) {
2902 2919 mutex_exit(&sptd->spt_lock);
2903 2920 return (0);
2904 2921 }
2905 2922 /*
2906 2923 * Don't cache new IO pages.
2907 2924 */
2908 2925 if (sptd->spt_ppa != NULL)
2909 2926 sptd->spt_flags |= DISM_PPA_CHANGED;
2910 2927
2911 2928 mutex_enter(&sp->shm_mlock);
2912 2929 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2913 2930 if ((ppa = sptd->spt_ppa) != NULL)
2914 2931 sptd->spt_flags |= DISM_PPA_CHANGED;
2915 2932 mutex_exit(&sptd->spt_lock);
2916 2933
2917 2934 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2918 2935 mutex_exit(&sp->shm_mlock);
2919 2936
2920 2937 if (ppa != NULL)
2921 2938 seg_ppurge_wiredpp(ppa);
2922 2939 }
2923 2940 return (sts);
2924 2941 }
2925 2942
2926 2943 /*ARGSUSED*/
2927 2944 int
2928 2945 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2929 2946 {
2930 2947 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2931 2948 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2932 2949 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2933 2950
2934 2951 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2935 2952
2936 2953 /*
2937 2954 * ISM segment is always rw.
2938 2955 */
2939 2956 while (--pgno >= 0)
2940 2957 *protv++ = sptd->spt_prot;
2941 2958 return (0);
2942 2959 }
2943 2960
2944 2961 /*ARGSUSED*/
2945 2962 u_offset_t
2946 2963 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2947 2964 {
2948 2965 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2949 2966
2950 2967 /* Offset does not matter in ISM memory */
2951 2968
2952 2969 return ((u_offset_t)0);
2953 2970 }
2954 2971
2955 2972 /* ARGSUSED */
2956 2973 int
2957 2974 segspt_shmgettype(struct seg *seg, caddr_t addr)
2958 2975 {
2959 2976 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2960 2977 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2961 2978
2962 2979 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2963 2980
2964 2981 /*
2965 2982 	 * The shared memory mapping is always MAP_SHARED; swap is only
2966 2983 	 * reserved for DISM.
2967 2984 */
2968 2985 return (MAP_SHARED |
2969 2986 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2970 2987 }
2971 2988
2972 2989 /*ARGSUSED*/
2973 2990 int
2974 2991 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2975 2992 {
2976 2993 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2977 2994 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2978 2995
2979 2996 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2980 2997
2981 2998 *vpp = sptd->spt_vp;
2982 2999 return (0);
2983 3000 }
2984 3001
2985 3002 /*
2986 3003 * We need to wait for pending IO to complete to a DISM segment in order for
2987 3004 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2988 3005 * than enough time to wait.
2989 3006 */
2990 3007 static clock_t spt_pcache_wait = 120;
2991 3008
2992 3009 /*ARGSUSED*/
2993 3010 static int
2994 3011 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2995 3012 {
2996 3013 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2997 3014 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2998 3015 struct anon_map *amp;
2999 3016 pgcnt_t pg_idx;
3000 3017 ushort_t gen;
3001 3018 clock_t end_lbolt;
3002 3019 int writer;
3003 3020 page_t **ppa;
3004 3021
3005 3022 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
3006 3023
3007 3024 if (behav == MADV_FREE || behav == MADV_PURGE) {
3008 3025 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
3009 3026 return (0);
3010 3027
3011 3028 amp = sptd->spt_amp;
3012 3029 pg_idx = seg_page(seg, addr);
3013 3030
3014 3031 mutex_enter(&sptd->spt_lock);
3015 3032 if ((ppa = sptd->spt_ppa) == NULL) {
3016 3033 mutex_exit(&sptd->spt_lock);
3017 3034 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3018 3035 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3019 3036 			ANON_LOCK_EXIT(&amp->a_rwlock);
3020 3037 return (0);
3021 3038 }
3022 3039
3023 3040 sptd->spt_flags |= DISM_PPA_CHANGED;
3024 3041 gen = sptd->spt_gen;
3025 3042
3026 3043 mutex_exit(&sptd->spt_lock);
3027 3044
3028 3045 /*
3029 3046 * Purge all DISM cached pages
3030 3047 */
3031 3048 seg_ppurge_wiredpp(ppa);
3032 3049
3033 3050 /*
3034 3051 * Drop the AS_LOCK so that other threads can grab it
3035 3052 * in the as_pageunlock path and hopefully get the segment
3036 3053 * kicked out of the seg_pcache. We bump the shm_softlockcnt
3037 3054 * to keep this segment resident.
3038 3055 */
3039 3056 writer = AS_WRITE_HELD(seg->s_as);
3040 3057 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3041 3058 AS_LOCK_EXIT(seg->s_as);
3042 3059
3043 3060 mutex_enter(&sptd->spt_lock);
3044 3061
3045 3062 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
3046 3063
3047 3064 /*
3048 3065 * Try to wait for pages to get kicked out of the seg_pcache.
3049 3066 */
3050 3067 while (sptd->spt_gen == gen &&
3051 3068 (sptd->spt_flags & DISM_PPA_CHANGED) &&
3052 3069 ddi_get_lbolt() < end_lbolt) {
3053 3070 if (!cv_timedwait_sig(&sptd->spt_cv,
3054 3071 &sptd->spt_lock, end_lbolt)) {
3055 3072 break;
3056 3073 }
3057 3074 }
3058 3075
3059 3076 mutex_exit(&sptd->spt_lock);
3060 3077
3061 3078 /* Regrab the AS_LOCK and release our hold on the segment */
3062 3079 AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
3063 3080 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3064 3081 if (shmd->shm_softlockcnt <= 0) {
3065 3082 if (AS_ISUNMAPWAIT(seg->s_as)) {
3066 3083 mutex_enter(&seg->s_as->a_contents);
3067 3084 if (AS_ISUNMAPWAIT(seg->s_as)) {
3068 3085 AS_CLRUNMAPWAIT(seg->s_as);
3069 3086 cv_broadcast(&seg->s_as->a_cv);
3070 3087 }
3071 3088 mutex_exit(&seg->s_as->a_contents);
3072 3089 }
3073 3090 }
3074 3091
3075 3092 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3076 3093 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3077 3094 		ANON_LOCK_EXIT(&amp->a_rwlock);
3078 3095 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
3079 3096 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
3080 3097 int already_set;
3081 3098 ulong_t anon_index;
3082 3099 lgrp_mem_policy_t policy;
3083 3100 caddr_t shm_addr;
3084 3101 size_t share_size;
3085 3102 size_t size;
3086 3103 struct seg *sptseg = shmd->shm_sptseg;
3087 3104 caddr_t sptseg_addr;
3088 3105
3089 3106 /*
3090 3107 * Align address and length to page size of underlying segment
3091 3108 */
3092 3109 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
3093 3110 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
3094 3111 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
3095 3112 share_size);
3096 3113
3097 3114 amp = shmd->shm_amp;
3098 3115 anon_index = seg_page(seg, shm_addr);
3099 3116
3100 3117 /*
3101 3118 * And now we may have to adjust size downward if we have
3102 3119 * exceeded the realsize of the segment or initial anon
3103 3120 * allocations.
3104 3121 */
3105 3122 sptseg_addr = sptseg->s_base + ptob(anon_index);
3106 3123 if ((sptseg_addr + size) >
3107 3124 (sptseg->s_base + sptd->spt_realsize))
3108 3125 size = (sptseg->s_base + sptd->spt_realsize) -
3109 3126 sptseg_addr;
3110 3127
3111 3128 /*
3112 3129 * Set memory allocation policy for this segment
3113 3130 */
3114 3131 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
3115 3132 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
3116 3133 NULL, 0, len);
3117 3134
3118 3135 /*
3119 3136 * If random memory allocation policy set already,
3120 3137 * don't bother reapplying it.
3121 3138 */
3122 3139 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3123 3140 return (0);
3124 3141
3125 3142 /*
3126 3143 * Mark any existing pages in the given range for
3127 3144 		 * migration, flush the I/O page cache, and use the
3128 3145 		 * underlying segment to calculate the anon index and to
3129 3146 		 * get the anonmap and vnode pointer.
3130 3147 */
3131 3148 if (shmd->shm_softlockcnt > 0)
3132 3149 segspt_purge(seg);
3133 3150
3134 3151 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3135 3152 }
3136 3153
3137 3154 return (0);
3138 3155 }
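
The MADV_FREE/MADV_PURGE path above waits, bounded by spt_pcache_wait, for either the generation count to move or DISM_PPA_CHANGED to clear once the wired pages leave seg_pcache. A rough pthread sketch of that wait pattern follows; the reclaimer thread, the 2-second deadline and the variable names are assumptions made for illustration, not the kernel implementation.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static unsigned gen;
static int ppa_changed = 1;

static void *
reclaimer(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	gen++;			/* as segspt_reclaim() bumps spt_gen */
	ppa_changed = 0;	/* as it clears DISM_PPA_CHANGED */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;
	struct timespec deadline;
	unsigned start_gen;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 2;	/* analogue of hz * spt_pcache_wait */

	pthread_mutex_lock(&lock);
	start_gen = gen;
	(void) pthread_create(&tid, NULL, reclaimer, NULL);
	while (gen == start_gen && ppa_changed) {
		if (pthread_cond_timedwait(&cv, &lock, &deadline) != 0)
			break;	/* deadline passed; stop waiting */
	}
	pthread_mutex_unlock(&lock);
	(void) pthread_join(tid, NULL);
	printf("gen=%u changed=%d\n", gen, ppa_changed);
	return (0);
}
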
3139 3156
3140 3157 /*ARGSUSED*/
3141 3158 void
3142 3159 segspt_shmdump(struct seg *seg)
3143 3160 {
3144 3161 /* no-op for ISM segment */
3145 3162 }
3146 3163
3147 3164 /*ARGSUSED*/
3148 3165 static int
3149 3166 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3150 3167 {
3151 3168 return (ENOTSUP);
3152 3169 }
3153 3170
3154 3171 /*
3155 3172 * get a memory ID for an addr in a given segment
3156 3173 */
3157 3174 static int
3158 3175 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3159 3176 {
3160 3177 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3161 3178 struct anon *ap;
3162 3179 size_t anon_index;
3163 3180 struct anon_map *amp = shmd->shm_amp;
3164 3181 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3165 3182 struct seg *sptseg = shmd->shm_sptseg;
3166 3183 anon_sync_obj_t cookie;
3167 3184
3168 3185 anon_index = seg_page(seg, addr);
3169 3186
3170 3187 if (addr > (seg->s_base + sptd->spt_realsize)) {
3171 3188 return (EFAULT);
3172 3189 }
3173 3190
3174 3191 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3175 3192 anon_array_enter(amp, anon_index, &cookie);
3176 3193 ap = anon_get_ptr(amp->ahp, anon_index);
3177 3194 if (ap == NULL) {
3178 3195 struct page *pp;
3179 3196 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3180 3197
3181 3198 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3182 3199 if (pp == NULL) {
3183 3200 anon_array_exit(&cookie);
3184 3201 			ANON_LOCK_EXIT(&amp->a_rwlock);
3185 3202 return (ENOMEM);
3186 3203 }
3187 3204 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3188 3205 page_unlock(pp);
3189 3206 }
3190 3207 anon_array_exit(&cookie);
3191 3208 	ANON_LOCK_EXIT(&amp->a_rwlock);
3192 3209 memidp->val[0] = (uintptr_t)ap;
3193 3210 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3194 3211 return (0);
3195 3212 }
3196 3213
3197 3214 /*
3198 3215 * Get memory allocation policy info for specified address in given segment
3199 3216 */
3200 3217 static lgrp_mem_policy_info_t *
3201 3218 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3202 3219 {
3203 3220 struct anon_map *amp;
3204 3221 ulong_t anon_index;
3205 3222 lgrp_mem_policy_info_t *policy_info;
3206 3223 struct shm_data *shm_data;
3207 3224
3208 3225 ASSERT(seg != NULL);
3209 3226
3210 3227 /*
3211 3228 * Get anon_map from segshm
3212 3229 *
3213 3230 * Assume that no lock needs to be held on anon_map, since
3214 3231 * it should be protected by its reference count which must be
3215 3232 * nonzero for an existing segment
3216 3233 * Need to grab readers lock on policy tree though
3217 3234 */
3218 3235 shm_data = (struct shm_data *)seg->s_data;
3219 3236 if (shm_data == NULL)
3220 3237 return (NULL);
3221 3238 amp = shm_data->shm_amp;
3222 3239 ASSERT(amp->refcnt != 0);
3223 3240
3224 3241 /*
3225 3242 * Get policy info
3226 3243 *
3227 3244 * Assume starting anon index of 0
3228 3245 */
3229 3246 anon_index = seg_page(seg, addr);
3230 3247 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3231 3248
3232 3249 return (policy_info);
3233 3250 }
3234 3251
3235 3252 /*ARGSUSED*/
3236 3253 static int
3237 3254 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3238 3255 {
3239 3256 return (0);
3240 3257 }
↓ open down ↓ |
2895 lines elided |
↑ open up ↑ |