/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif  /* __sparc */

/*
 * Turn a byte length into a page count.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this macro handles 64-bit sizes
 * for large physical-memory 32-bit machines.
 */
#define BTOP(x) ((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;      /* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define MM_KMEMLOG_NENTRIES     64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allkmem writes, we log information that might be useful in the
 * event that a write is errant (that is, due to operator error) and induces
 * a later problem.  Note that (in particular) in the event of such
 * operator-induced corruption, a search over the kernel address space for
 * the corrupted address will yield the ring buffer entry that recorded the
 * write.  And should it seem baroque or otherwise unnecessary, yes, we need
 * this kind of auditing facility and yes, we learned that the hard way:
 * disturbingly, there exist recommendations for "tuning" the system that
 * involve writing to kernel memory addresses via the kernel debugger, and --
 * as we discovered -- these can easily be applied incorrectly or unsafely,
 * yielding an entirely undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
        mm_logentry_t *ent;
        proc_t *p = curthread->t_procp;

        mutex_enter(&mm_lock);

        ent = &mm_kmemlog[mm_kmemlogent++];

        if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
                mm_kmemlogent = 0;

        ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
        ent->mle_len = uio->uio_resid;
        gethrestime(&ent->mle_hrestime);
        ent->mle_hrtime = gethrtime();
        ent->mle_pid = p->p_pidp->pid_id;

        (void) strncpy(ent->mle_psargs,
            p->p_user.u_psargs, sizeof (ent->mle_psargs));

        mutex_exit(&mm_lock);
}

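/*
 * Attach entry point: create the minor nodes (mem, kmem, allkmem, null,
 * zero), allocate the page-sized kernel VA window used by mmio() for
 * transient mappings, and install the "phys_installed" kstat.
 */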
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        int i;
        struct mem_minor {
                char *name;
                minor_t minor;
                int privonly;
                const char *rdpriv;
                const char *wrpriv;
                mode_t priv_mode;
        } mm[] = {
                { "mem",        M_MEM,          0,      NULL,   "all",  0640 },
                { "kmem",       M_KMEM,         0,      NULL,   "all",  0640 },
                { "allkmem",    M_ALLKMEM,      0,      "all",  "all",  0600 },
                { "null",       M_NULL, PRIVONLY_DEV,   NULL,   NULL,   0666 },
                { "zero",       M_ZERO, PRIVONLY_DEV,   NULL,   NULL,   0666 },
        };
        kstat_t *ksp;

        mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
        mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

        for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
                if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
                    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
                    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
                    DDI_FAILURE) {
                        ddi_remove_minor_node(devi, NULL);
                        return (DDI_FAILURE);
                }
        }

        mm_dip = devi;

        ksp = kstat_create("mm", 0, "phys_installed", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
        if (ksp != NULL) {
                ksp->ks_update = mm_kstat_update;
                ksp->ks_snapshot = mm_kstat_snapshot;
                ksp->ks_lock = &mm_lock; /* XXX - not really needed */
                kstat_install(ksp);
        }

        mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "kmem_io_access", 0);

        return (DDI_SUCCESS);
}

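/*
 * getinfo(9E) entry point: translate a dev_t into the driver's single
 * devinfo pointer or instance number (this is always instance 0).
 */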
/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)mm_dip;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

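/*
 * open(9E) entry point: allow only the known minors, and only when opened
 * as a character device.
 */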
/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
        switch (getminor(*devp)) {
        case M_NULL:
        case M_ZERO:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                /* standard devices */
                break;

        default:
                /* Unsupported or unknown type */
                return (EINVAL);
        }
        /* must be character device */
        if (typ != OTYP_CHR)
                return (EINVAL);
        return (0);
}

struct pollhead mm_pollhd;

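/*
 * chpoll(9E) entry point: the memory devices never block, so every
 * requested event is reported as immediately ready.
 */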
/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        switch (getminor(dev)) {
        case M_NULL:
        case M_ZERO:
        case M_MEM:
        case M_KMEM:
        case M_ALLKMEM:
                *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
                    POLLWRNORM | POLLRDBAND | POLLWRBAND);
                /*
                 * A non-NULL pollhead pointer must be returned in case
                 * the user polls for 0 events.
                 */
                *phpp = !anyyet && !*reventsp ?
                    &mm_pollhd : (struct pollhead *)NULL;
                return (0);
        default:
                /* no other devices currently support polling */
                return (ENXIO);
        }
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
        /*
         * implement zero size to reduce overhead (avoid two failing
         * property lookups per stat).
         */
        return (ddi_prop_op_size(dev, dip, prop_op,
            flags, name, valuep, lengthp, 0));
}

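/*
 * Perform the actual I/O for one page: map the page frame into kernel
 * virtual address space (via kpm when possible, otherwise through the
 * mm_map scratch page under mm_lock), then uiomove() the data.  Non-memory
 * (device) page frames are accessed with ddi_peekpokeio(), and only when
 * I/O access is explicitly allowed.
 */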
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
        int error = 0;
        int devload = 0;
        int is_memory = pf_is_memory(pfn);
        size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
            (size_t)uio->uio_iov->iov_len);
        caddr_t va = NULL;

        mutex_enter(&mm_lock);

        if (is_memory && kpm_enable) {
                if (pp)
                        va = hat_kpm_mapin(pp, NULL);
                else
                        va = hat_kpm_mapin_pfn(pfn);
        }

        if (va == NULL) {
                hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
                    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
                    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
                va = mm_map;
                devload = 1;
        }

        if (!is_memory) {
                if (allowio) {
                        size_t c = uio->uio_iov->iov_len;

                        if (ddi_peekpokeio(NULL, uio, rw,
                            (caddr_t)(uintptr_t)uio->uio_loffset, c,
                            sizeof (int32_t)) != DDI_SUCCESS)
                                error = EFAULT;
                } else
                        error = EIO;
        } else
                error = uiomove(va + pageoff, nbytes, rw, uio);

        if (devload)
                hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
        else if (pp)
                hat_kpm_mapout(pp, NULL, va);
        else
                hat_kpm_mapout_pfn(pfn);

        mutex_exit(&mm_lock);
        return (error);
}

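/*
 * Return nonzero if the segment backing the given kernel VA supports
 * as_pagelock() from this context (that is, can operate without minor
 * faults); callers use this to decide whether locking should be attempted.
 */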
static int
mmpagelock(struct as *as, caddr_t va)
{
        struct seg *seg;
        int i;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, va);
        i = (seg != NULL) ? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
        AS_LOCK_EXIT(as);

        return (i);
}

#ifdef  __sparc

#define NEED_LOCK_KVADDR(kva)   mmpagelock(&kas, kva)

#else   /* __i386, __amd64 */

#define NEED_LOCK_KVADDR(va)    0

#endif  /* __sparc */

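/*
 * Common read/write path for all memory minors: demultiplex on the minor
 * number, handing physical and kernel addresses page-at-a-time to mmio().
 * Reads of /dev/zero are zero-filled in place; reads of /dev/null return
 * EOF, and writes to either are simply consumed.
 */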
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
        pfn_t v;
        struct iovec *iov;
        int error = 0;
        size_t c;
        ssize_t oresid = uio->uio_resid;
        minor_t minor = getminor(dev);

        while (uio->uio_resid > 0 && error == 0) {
                iov = uio->uio_iov;
                if (iov->iov_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        if (uio->uio_iovcnt < 0)
                                panic("mmrw");
                        continue;
                }
                switch (minor) {

                case M_MEM:
                        memlist_read_lock();
                        if (!address_in_memlist(phys_install,
                            (uint64_t)uio->uio_loffset, 1)) {
                                memlist_read_unlock();
                                error = EFAULT;
                                break;
                        }
                        memlist_read_unlock();

                        v = BTOP((u_offset_t)uio->uio_loffset);
                        error = mmio(uio, rw, v,
                            uio->uio_loffset & PAGEOFFSET, 0, NULL);
                        break;

                case M_KMEM:
                case M_ALLKMEM:
                        {
                        page_t **ppp = NULL;
                        caddr_t vaddr = (caddr_t)uio->uio_offset;
                        int try_lock = NEED_LOCK_KVADDR(vaddr);
                        int locked = 0;

                        if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
                                break;

                        if (rw == UIO_WRITE)
                                mm_logkmem(uio);

                        /*
                         * If vaddr does not map a valid page, as_pagelock()
                         * will return failure.  Hence we can't check the
                         * return value and return EFAULT here as we'd like.
                         * seg_kp and seg_kpm do not properly support
                         * as_pagelock() for this context, so we avoid it
                         * via the try_lock check above.  Some day, when
                         * kernel page locking gets redesigned, all this
                         * muck can be cleaned up.
                         */
                        if (try_lock)
                                locked = (as_pagelock(&kas, &ppp, vaddr,
                                    PAGESIZE, S_WRITE) == 0);

                        v = hat_getpfnum(kas.a_hat,
                            (caddr_t)(uintptr_t)uio->uio_loffset);
                        if (v == PFN_INVALID) {
                                if (locked)
                                        as_pageunlock(&kas, ppp, vaddr,
                                            PAGESIZE, S_WRITE);
                                error = EFAULT;
                                break;
                        }

                        error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
                            minor == M_ALLKMEM || mm_kmem_io_access,
                            (locked && ppp) ? *ppp : NULL);
                        if (locked)
                                as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
                                    S_WRITE);
                        }

                        break;

                case M_ZERO:
                        if (rw == UIO_READ) {
                                label_t ljb;

                                if (on_fault(&ljb)) {
                                        no_fault();
                                        error = EFAULT;
                                        break;
                                }
                                uzero(iov->iov_base, iov->iov_len);
                                no_fault();
                                uio->uio_resid -= iov->iov_len;
                                uio->uio_loffset += iov->iov_len;
                                break;
                        }
                        /* else it's a write, fall through to NULL case */
                        /*FALLTHROUGH*/

                case M_NULL:
                        if (rw == UIO_READ)
                                return (0);
                        c = iov->iov_len;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_loffset += c;
                        uio->uio_resid -= c;
                        break;

                }
        }
        return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
        return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
        mem_vtop32_t vtop32;
#endif
        mem_vtop_t mem_vtop;
        proc_t *p;
        pfn_t pfn = (pfn_t)PFN_INVALID;
        pid_t pid = 0;
        struct as *as;
        struct seg *seg;

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
                        return (EFAULT);
                mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
                mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

                if (mem_vtop.m_as != NULL)
                        return (EINVAL);
        }
#endif

        if (mem_vtop.m_as == &kas) {
                pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
        } else {
                if (mem_vtop.m_as == NULL) {
                        /*
                         * Assume the calling process's address space if the
                         * caller didn't specify one.
                         */
                        p = curthread->t_procp;
                        if (p == NULL)
                                return (EIO);
                        mem_vtop.m_as = p->p_as;
                }

                mutex_enter(&pidlock);
                for (p = practive; p != NULL; p = p->p_next) {
                        if (p->p_as == mem_vtop.m_as) {
                                pid = p->p_pid;
                                break;
                        }
                }
                mutex_exit(&pidlock);
                if (p == NULL)
                        return (EIO);
                p = sprlock(pid);
                if (p == NULL)
                        return (EIO);
                as = p->p_as;
                if (as == mem_vtop.m_as) {
                        mutex_exit(&p->p_lock);
                        AS_LOCK_ENTER(as, RW_READER);
                        for (seg = AS_SEGFIRST(as); seg != NULL;
                            seg = AS_SEGNEXT(as, seg))
                                if ((uintptr_t)mem_vtop.m_va -
                                    (uintptr_t)seg->s_base < seg->s_size)
                                        break;
                        if (seg != NULL)
                                pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
                        AS_LOCK_EXIT(as);
                        mutex_enter(&p->p_lock);
                }
                sprunlock(p);
        }
        mem_vtop.m_pfn = pfn;
        if (pfn == PFN_INVALID)
                return (EIO);

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
                        return (EFAULT);
        }
#ifdef _SYSCALL32
        else {
                vtop32.m_pfn = mem_vtop.m_pfn;
                if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
                        return (EFAULT);
        }
#endif

        return (0);
}
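
/*
 * For illustration, a hedged sketch of how a privileged user-space consumer
 * (such as libkvm) might invoke MEM_VTOP; the command is honored only on the
 * /dev/kmem minor (see mmioctl() below).  The variable names here are
 * hypothetical and error handling is elided:
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);	// privileged open
 *
 *	vtop.m_as = NULL;	// NULL selects the caller's address space
 *	vtop.m_va = va;		// hypothetical virtual address of interest
 *	if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		(void) printf("pfn = %lx\n", vtop.m_pfn);
 */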

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
        extern int page_retire_test(void);
        uint64_t pa;

        if (copyin((void *)data, &pa, sizeof (uint64_t))) {
                return (EFAULT);
        }

        switch (cmd) {
        case MEM_PAGE_ISRETIRED:
                return (page_retire_check(pa, NULL));

        case MEM_PAGE_UNRETIRE:
                return (page_unretire(pa));

        case MEM_PAGE_RETIRE:
                return (page_retire(pa, PR_FMA));

        case MEM_PAGE_RETIRE_MCE:
                return (page_retire(pa, PR_MCE));

        case MEM_PAGE_RETIRE_UE:
                return (page_retire(pa, PR_UE));

        case MEM_PAGE_GETERRORS:
                {
                        uint64_t page_errors;
                        int rc = page_retire_check(pa, &page_errors);
                        if (copyout(&page_errors, (void *)data,
                            sizeof (uint64_t))) {
                                return (EFAULT);
                        }
                        return (rc);
                }

        case MEM_PAGE_RETIRE_TEST:
                return (page_retire_test());

        }

        return (EINVAL);
}
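
/*
 * For illustration only, a sketch of how FMA-style tooling might drive the
 * page-retire commands; these are honored only on the /dev/mem minor (see
 * mmioctl() below).  The physical address is a hypothetical example value
 * and error handling is elided:
 *
 *	uint64_t pa = 0x12345000;		// hypothetical PA
 *	int fd = open("/dev/mem", O_RDONLY);	// privileged open
 *
 *	(void) ioctl(fd, MEM_PAGE_RETIRE, &pa);	// request retirement
 *	if (ioctl(fd, MEM_PAGE_ISRETIRED, &pa) == 0)
 *		(void) printf("page retired\n");
 */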

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) != 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        /*
         * Call into cpu specific code to do the lookup.
         */
        if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
            mem_name.m_addr, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                return (err);
        }

        if (len >= mem_name.m_namelen) {
                kmem_free(buf, bufsize);
                return (ENOSPC);
        }

        if (copyoutstr(buf, (char *)mem_name.m_name,
            mem_name.m_namelen, NULL) != 0) {
                kmem_free(buf, bufsize);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        return (0);
}

/*
 * Given a syndrome and address return information about the associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
        mem_info_t mem_info;
        int err;

        if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
                return (EFAULT);

        if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
            &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
            &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
                return (err);

        if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
                return (EFAULT);

        return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
        mem_name_t mem_name;
        void *buf;
        void *name;
        size_t  name_len;
        size_t bufsize;
        int len, err;

        if ((bufsize = cpu_get_name_bufsize()) == 0)
                return (ENOTSUP);

        if ((err = mm_read_mem_name(data, &mem_name)) != 0)
                return (err);

        buf = kmem_alloc(bufsize, KM_SLEEP);

        if (mem_name.m_namelen > 1024)
                mem_name.m_namelen = 1024; /* cap at 1024 bytes */

        name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

        if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
            mem_name.m_namelen, &name_len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        /*
         * Call into cpu specific code to do the lookup.
         */
        if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (err);
        }

        if (len > mem_name.m_sidlen) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (ENAMETOOLONG);
        }

        if (copyoutstr(buf, (char *)mem_name.m_sid,
            mem_name.m_sidlen, NULL) != 0) {
                kmem_free(buf, bufsize);
                kmem_free(name, mem_name.m_namelen);
                return (EFAULT);
        }

        kmem_free(buf, bufsize);
        kmem_free(name, mem_name.m_namelen);
        return (0);
}
#endif  /* __sparc */


/*
 * Private ioctls for
 *      libkvm to support kvm_physaddr().
 *      FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
        if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
            (cmd != MEM_VTOP && getminor(dev) != M_MEM))
                return (ENXIO);

        switch (cmd) {
        case MEM_VTOP:
                return (mmioctl_vtop(data));

        case MEM_PAGE_RETIRE:
        case MEM_PAGE_ISRETIRED:
        case MEM_PAGE_UNRETIRE:
        case MEM_PAGE_RETIRE_MCE:
        case MEM_PAGE_RETIRE_UE:
        case MEM_PAGE_GETERRORS:
        case MEM_PAGE_RETIRE_TEST:
                return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
        case MEM_NAME:
                return (mmioctl_get_mem_name(data));

        case MEM_INFO:
                return (mmioctl_get_mem_info(data));

        case MEM_SID:
                return (mmioctl_get_mem_sid(data));
#else
        case MEM_NAME:
        case MEM_INFO:
        case MEM_SID:
                return (ENOTSUP);
#endif  /* __sparc */
        }
        return (ENXIO);
}

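/*
 * mmap(9E) entry point: for /dev/mem, validate that the offset lies within
 * installed physical memory and translate it into a page-frame cookie.
 * All other minors fail here: kmem mappings are no longer supported, and
 * /dev/zero requests are converted to anonymous memory by mmsegmap().
 */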
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
        pfn_t pf;
        struct memlist *pmem;
        minor_t minor = getminor(dev);

        switch (minor) {
        case M_MEM:
                pf = btop(off);
                memlist_read_lock();
                for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                        if (pf >= BTOP(pmem->ml_address) &&
                            pf < BTOP(pmem->ml_address + pmem->ml_size)) {
                                memlist_read_unlock();
                                return (impl_obmem_pfnum(pf));
                        }
                }
                memlist_read_unlock();
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* no longer supported with KPR */
                return (-1);

        case M_ZERO:
                /*
                 * We shouldn't be mmap'ing to /dev/zero here as
                 * mmsegmap() should have already converted
                 * a mapping request for this device to a mapping
                 * using seg_vn for anonymous memory.
                 */
                break;

        }
        return (-1);
}

 842 
 843 /*
 844  * This function is called when a memory device is mmap'ed.
 845  * Set up the mapping to the correct device driver.
 846  */
 847 static int
 848 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
 849     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
 850 {
 851         struct segvn_crargs vn_a;
 852         struct segdev_crargs dev_a;
 853         int error;
 854         minor_t minor;
 855         off_t i;
 856 
 857         minor = getminor(dev);
 858 
 859         as_rangelock(as);
 860         /*
 861          * No need to worry about vac alignment on /dev/zero
 862          * since this is a "clone" object that doesn't yet exist.
 863          */
 864         error = choose_addr(as, addrp, len, off,
 865             (minor == M_MEM) || (minor == M_KMEM), flags);
 866         if (error != 0) {
 867                 as_rangeunlock(as);
 868                 return (error);
 869         }
 870 
 871         switch (minor) {
 872         case M_MEM:
 873                 /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
 874                 if ((flags & MAP_TYPE) != MAP_SHARED) {
 875                         as_rangeunlock(as);
 876                         return (EINVAL);
 877                 }
 878 
 879                 /*
 880                  * Check to ensure that the entire range is
 881                  * legal and we are not trying to map in
 882                  * more than the device will let us.
 883                  */
 884                 for (i = 0; i < len; i += PAGESIZE) {
 885                         if (mmmmap(dev, off + i, maxprot) == -1) {
 886                                 as_rangeunlock(as);
 887                                 return (ENXIO);
 888                         }
 889                 }
 890 
 891                 /*
 892                  * Use seg_dev segment driver for /dev/mem mapping.
 893                  */
 894                 dev_a.mapfunc = mmmmap;
 895                 dev_a.dev = dev;
 896                 dev_a.offset = off;
 897                 dev_a.type = (flags & MAP_TYPE);
 898                 dev_a.prot = (uchar_t)prot;
 899                 dev_a.maxprot = (uchar_t)maxprot;
 900                 dev_a.hat_attr = 0;
 901 
                /*
                 * Make /dev/mem mappings non-consistent since we can't
                 * alias pages that don't have page structs behind them,
                 * such as kernel stack pages.  If someone mmap()s a kernel
                 * stack page and we give him a TTE with CV set, a line from
                 * that page can get into both pages of the spitfire d$.
                 * A snoop from another processor will then invalidate only
                 * the first page, which once sent the kernel (xc_attention)
                 * into an infinite loop at PIL 13 with no way for interrupts
                 * to come in.  See 1203630.
                 */
                dev_a.hat_flags = HAT_LOAD_NOCONSIST;
                dev_a.devmap_data = NULL;

                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        case M_ZERO:
                /*
                 * Use seg_vn segment driver for /dev/zero mapping.
                 * Passing in a NULL amp gives us the "cloning" effect.
                 */
                vn_a.vp = NULL;
                vn_a.offset = 0;
                vn_a.type = (flags & MAP_TYPE);
                vn_a.prot = prot;
                vn_a.maxprot = maxprot;
                vn_a.flags = flags & ~MAP_TYPE;
                vn_a.cred = cred;
                vn_a.amp = NULL;
                vn_a.szc = 0;
                vn_a.lgrp_mem_policy_flags = 0;
                error = as_map(as, *addrp, len, segvn_create, &vn_a);
                break;

        case M_KMEM:
        case M_ALLKMEM:
                /* No longer supported with KPR. */
                error = ENXIO;
                break;

        case M_NULL:
                /*
                 * Use seg_dev segment driver for /dev/null mapping.
                 */
                dev_a.mapfunc = mmmmap;
                dev_a.dev = dev;
                dev_a.offset = off;
                dev_a.type = 0;         /* neither PRIVATE nor SHARED */
                dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
                dev_a.hat_attr = 0;
                dev_a.hat_flags = 0;
                error = as_map(as, *addrp, len, segdev_create, &dev_a);
                break;

        default:
                error = ENXIO;
        }

        as_rangeunlock(as);
        return (error);
}

 965 
 966 static struct cb_ops mm_cb_ops = {
 967         mmopen,                 /* open */
 968         nulldev,                /* close */
 969         nodev,                  /* strategy */
 970         nodev,                  /* print */
 971         nodev,                  /* dump */
 972         mmread,                 /* read */
 973         mmwrite,                /* write */
 974         mmioctl,                /* ioctl */
 975         nodev,                  /* devmap */
 976         mmmmap,                 /* mmap */
 977         mmsegmap,               /* segmap */
 978         mmchpoll,               /* poll */
 979         mmpropop,               /* prop_op */
 980         0,                      /* streamtab  */
 981         D_NEW | D_MP | D_64BIT | D_U64BIT
 982 };
 983 
 984 static struct dev_ops mm_ops = {
 985         DEVO_REV,               /* devo_rev, */
 986         0,                      /* refcnt  */
 987         mm_info,                /* get_dev_info */
 988         nulldev,                /* identify */
 989         nulldev,                /* probe */
 990         mm_attach,              /* attach */
 991         nodev,                  /* detach */
 992         nodev,                  /* reset */
 993         &mm_cb_ops,         /* driver operations */
 994         (struct bus_ops *)0,    /* bus operations */
 995         NULL,                   /* power */
 996         ddi_quiesce_not_needed,         /* quiesce */
 997 };
 998 
 999 static struct modldrv modldrv = {
1000         &mod_driverops, "memory driver", &mm_ops,
1001 };
1002 
1003 static struct modlinkage modlinkage = {
1004         MODREV_1, { &modldrv, NULL }
1005 };
1006 
1007 int
1008 _init(void)
1009 {
1010         return (mod_install(&modlinkage));
1011 }
1012 
1013 int
1014 _info(struct modinfo *modinfop)
1015 {
1016         return (mod_info(&modlinkage, modinfop));
1017 }
1018 
1019 int
1020 _fini(void)
1021 {
1022         return (mod_remove(&modlinkage));
1023 }
1024 
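/*
 * kstat update: size the raw kstat as one {address, size} pair per
 * phys_install memlist entry.
 */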
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
        struct memlist *pmem;
        uint_t count;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        count = 0;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
                count++;
        }
        memlist_read_unlock();

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

        return (0);
}

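/*
 * kstat snapshot: copy the current phys_install list into the caller's
 * buffer as {address, size} pairs, without overrunning ks_data_size.
 */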
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
        struct memlist *pmem;
        struct memunit {
                uint64_t address;
                uint64_t size;
        } *kspmem;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        ksp->ks_snaptime = gethrtime();

        kspmem = (struct memunit *)buf;
        memlist_read_lock();
        for (pmem = phys_install; pmem != NULL;
            pmem = pmem->ml_next, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                        break;
                kspmem->address = pmem->ml_address;
                kspmem->size = pmem->ml_size;
        }
        memlist_read_unlock();

        return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
                        return (EFAULT);
        }
#ifdef  _SYSCALL32
        else {
                mem_name32_t mem_name32;

                if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
                        return (EFAULT);
                mem_name->m_addr = mem_name32.m_addr;
                mem_name->m_synd = mem_name32.m_synd;
                mem_name->m_type[0] = mem_name32.m_type[0];
                mem_name->m_type[1] = mem_name32.m_type[1];
                mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
                mem_name->m_namelen = (size_t)mem_name32.m_namelen;
                mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
                mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
        }
#endif  /* _SYSCALL32 */

        return (0);
}